The data were read from lacZ sequencing results compiled across multiple studies.
# Read the tab-delimited mutation tables. The three tables from this study are
# each tagged with their mutation type, study label, and sequencing technology.
snvs <- read_delim("data/raw/SNV_data.txt", delim = "\t") %>%
  mutate(Type = "SNV", Study = "This Study", Technology = "NGS")
ins <- read_delim("data/raw/Insertion_data.txt", delim = "\t") %>%
  mutate(Type = "Insertion", Study = "This Study", Technology = "NGS")
del <- read_delim("data/raw/Deletion_data.txt", delim = "\t") %>%
  mutate(Type = "Deletion", Study = "This Study", Technology = "NGS")
# Historical records are annotated separately.
historical <- read_delim("data/raw/Historical_data.txt", delim = "\t")
# Label the historical records as Sanger-sequenced and derive a mutation Type
# for each row. case_when() takes the first matching branch, so the explicit
# Deletion/Insertion flags take precedence over the Ref/Alt-based rules.
historical <- historical %>%
  mutate(
    Technology = "Sanger",
    Type = case_when(
      Deletion == 1 ~ "Deletion",
      Insertion == 1 ~ "Insertion",
      # Alt text containing "del" (any case) marks a deletion.
      str_detect(Alt, regex("del", ignore_case = TRUE)) ~ "Deletion",
      # Otherwise compare allele lengths.
      str_length(Alt) > str_length(Ref) ~ "Insertion",
      str_length(Alt) < str_length(Ref) ~ "Deletion",
      str_length(Ref) == str_length(Alt) & str_length(Ref) > 1 ~ "Complex",
      # Remaining rows whose Alt does not mention "del" are SNVs.
      !str_detect(Alt, regex("del", ignore_case = TRUE)) ~ "SNV"
    ),
    # Codon is coerced to numeric; entries that cannot be parsed become NA.
    Codon = as.numeric(Codon)
  )
# Combine the NGS tables with the historical records and derive per-mutation
# annotations.
# The joins are given no `by`, so dplyr joins on every column the two tables
# share; na_matches = "never" stops NA keys from matching each other.
snvs_ins_del_clean <- snvs %>%
  dplyr::full_join(ins, keep = FALSE, na_matches = "never") %>%
  dplyr::full_join(del, keep = FALSE, na_matches = "never") %>%
  dplyr::full_join(historical, keep = FALSE, na_matches = "never") %>%
  dplyr::mutate(
    # Normalise allele case first so that `mutation` agrees with Ref/Alt.
    Ref = str_to_upper(Ref),
    Alt = str_to_upper(Alt),
    mutation = paste0(Ref, ">", Alt),
    aa_change = paste0(`Ref A.A.`, ">", `Alt A.A.`),
    # 0 when the amino acid is unchanged, 1 when it differs (NA if either
    # amino acid is missing).
    FunctionalChange = ifelse(
      test = `Ref A.A.` == `Alt A.A.`,
      yes = 0,
      no = 1
    ),
    # 1-based position of the base within its codon (1, 2 or 3).
    codon_position = ((Position - 1) %% 3) + 1
  )
# Rewrite Position as relative to the lacZ reference.
# Compared with the reference, the sequence used for Position carries 3 extra
# leading bases (positions 1-3) and a 15-base insertion (positions 26-40):
#   positions 4-25 (pre-insertion)  -> subtract 3
#   positions > 40 (post-insertion) -> subtract 18 (3 + 15)
# Positions 1-3 and 26-40 have no counterpart in the reference and stay NA,
# as do rows with a missing Position.
snvs_ins_del_clean <- snvs_ins_del_clean %>%
  dplyr::mutate(PositionRef = dplyr::case_when(
    Position >= 4 & Position < 26 ~ Position - 3,
    Position > 40 ~ Position - 18,
    TRUE ~ NA_real_
  ))
# Add the amino acid 3-letter codes, the codon number in reference
# coordinates, and the complete residue name.
snvs_ins_del_clean <- snvs_ins_del_clean %>%
  dplyr::mutate(
    # Lookup by the amino acid column value; this still won't work for
    # multi-codon mutations.
    residue_code = Biostrings::AMINO_ACID_CODE[`Ref A.A.`],
    alt_code = Biostrings::AMINO_ACID_CODE[`Alt A.A.`],
    CodonRef = ceiling(PositionRef / 3),
    # Residue code immediately followed by the Codon value.
    residue_name = paste0(residue_code, Codon)
  )
# Add domain information (codon ranges, inclusive, with Pfam accessions):
#   Sugar Binding (49-219; PF02837)
#   β-Galactosidase (221-334; PF00703)
#   TIM Barrel (336-630; PF02836)
#   β-Galactosidase Small Chain (749-1022; PF02929)
# domain_breakpoints_nuc <- c(49*3, 219*3, 221*3, 334*3, 336*3, 630*3, 749*3, 1022*3)
d1 <- dplyr::between(snvs_ins_del_clean$Codon, 49, 219)
d2 <- dplyr::between(snvs_ins_del_clean$Codon, 221, 334)
d3 <- dplyr::between(snvs_ins_del_clean$Codon, 336, 630)
d4 <- dplyr::between(snvs_ins_del_clean$Codon, 749, 1022)
# The ranges do not overlap, so at most one flag is TRUE per row. Rows outside
# every range, or with a missing Codon, get NA.
snvs_ins_del_clean$Domain <- dplyr::case_when(
  d1 ~ "Sugar Binding (PF02837)",
  d2 ~ "β-Galactosidase (PF00703)",
  d3 ~ "TIM Barrel (PF02836)",
  d4 ~ "β-Galactosidase Small Chain (PF02929)",
  TRUE ~ NA_character_
)
# Number of rows per protein domain (rows without a domain appear as NA).
dplyr::count(snvs_ins_del_clean, Domain)
## # A tibble: 5 × 2
## Domain n
## <chr> <int>
## 1 Sugar Binding (PF02837) 901
## 2 TIM Barrel (PF02836) 2124
## 3 β-Galactosidase (PF00703) 384
## 4 β-Galactosidase Small Chain (PF02929) 1401
## 5 <NA> 1655
# Domain n
# 1 Sugar Binding (PF02837) 901
# 2 TIM Barrel (PF02836) 2124
# 3 β-Galactosidase (PF00703) 384
# 4 β-Galactosidase Small Chain (PF02929) 1401
# 5 NA 1655
# Rows whose Study is anything other than "This Study".
historical_clean <- dplyr::filter(snvs_ins_del_clean, Study != "This Study")
# Per-row flag: TRUE when the row repeats an earlier row in every column.
duplicated <- duplicated(snvs_ins_del_clean)
duplicates <- snvs_ins_del_clean[duplicated, ]
# This will collapse duplicated lines, i.e., same mutation but
# from different animals/samples
snvs_ins_del_collapsed <- dplyr::distinct(snvs_ins_del_clean)
# Preview the first rows of the annotated table.
snvs_ins_del_clean %>%
  head() %>%
  knitr::kable()
| Exposure | Tissue | Dose | Position | Ref | Alt | Tech Rep1 | Tech Rep2 | Tech Difference | Background | Avg Freq | Count | A:T to G:C | G:C to A:T | G:C to T:A | G:C to C:G | A:T to T:A | A:T to C:G | Insertion | Deletion | Codon | Consequence | Ref Codon | Alt Codon | Ref A.A. | Alt A.A. | Type | Study | Technology | Tech Diff | mutation | aa_change | FunctionalChange | codon_position | PositionRef | residue_code | alt_code | CodonRef | residue_name | Domain |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| BaP | Bone Marrow | 100 | 19 | T | C | 0.0189 | 0.0261 | 1.3829 | 0.0279 | 0.0225 | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | missense | TCA | CCA | S | P | SNV | This Study | NGS | NA | T>C | S>P | 1 | 1 | 16 | Ser | Pro | 6 | Ser7 | NA |
| BaP | Bone Marrow | 100 | 19 | T | C | 0.0159 | 0.0222 | 1.3964 | 0.0279 | 0.0190 | 9 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | missense | TCA | CCA | S | P | SNV | This Study | NGS | NA | T>C | S>P | 1 | 1 | 16 | Ser | Pro | 6 | Ser7 | NA |
| BaP | Bone Marrow | 100 | 41 | C | A | 0.0035 | 0.0038 | 1.0731 | 0.0002 | 0.0036 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 | missense | CCC | CAC | P | H | SNV | This Study | NGS | NA | C>A | P>H | 1 | 2 | 23 | Pro | His | 8 | Pro8 | NA |
| BaP | Bone Marrow | 100 | 51 | A | C | 0.0025 | 0.0022 | 1.1190 | 0.0042 | 0.0024 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 11 | missense | TTA | TTC | L | F | SNV | This Study | NGS | NA | A>C | L>F | 1 | 3 | 33 | Leu | Phe | 11 | Leu11 | NA |
| BaP | Bone Marrow | 100 | 109 | C | A | 0.0044 | 0.0037 | 1.1947 | 0.0013 | 0.0041 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 31 | missense | CCC | ACC | P | T | SNV | This Study | NGS | NA | C>A | P>T | 1 | 1 | 91 | Pro | Thr | 31 | Pro31 | NA |
| BaP | Bone Marrow | 100 | 110 | C | T | 0.0045 | 0.0052 | 1.1395 | 0.0051 | 0.0048 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 31 | missense | CCC | CTC | P | L | SNV | This Study | NGS | NA | C>T | P>L | 1 | 2 | 92 | Pro | Leu | 31 | Pro31 | NA |
First, some plots to look at the composition of the data.
By position:
# Number of mutations by position (grouped in histogram).
# After counting there is one row per Position, so each row is weighted by its
# count `n`; without the weight the bars would show how many distinct
# positions fall into each bin rather than how many mutations.
snvs_ins_del_clean %>%
  dplyr::count(Position) %>%
  ggplot(aes(x = Position, weight = n)) +
  # 30 bins is the geom_histogram() default, stated explicitly.
  geom_histogram(bins = 30) +
  ggtitle("Number of mutations by position (grouped in histogram)")
# Number of mutations by position, high granularity.
# One bar segment per (Position, Alt) pair; segments sharing a Position stack.
snvs_ins_del_clean %>%
  dplyr::count(Position, Alt) %>%
  ggplot(aes(x = Position, y = n)) +
  geom_col() +
  ggtitle("Number of mutations by position, high granularity")
# Number of mutations by position, colored by type.
# Counts are taken per (Position, Type, Alt) and stacked at each Position.
snvs_ins_del_clean %>%
  dplyr::count(Position, Type, Alt) %>%
  ggplot(aes(x = Position, y = n, fill = Type)) +
  geom_col() +
  ggtitle("Number of mutations by position, colored by type")
# Number of mutations by position, colored by type, log scale.
# NOTE(review): segments are stacked after the log transform, so a bar's total
# height is the sum of log2(n + 1) over its segments, not log2 of the total —
# confirm this is the intended display.
snvs_ins_del_clean %>%
  dplyr::count(Position, Type, Alt) %>%
  ggplot(aes(x = Position, y = log2(n + 1), fill = Type)) +
  geom_col() +
  ggtitle("Number of mutations by position, colored by type, log scale")
This section explores how often each mutation was observed, i.e., the numbers of singletons versus repeatedly observed mutations.
# Number of total mutations, including historical data, and including indels,
# after collapsing rows that are identical in every column (i.e., the same
# mutation record repeated across samples).
snvs_ins_del_clean %>%
  dplyr::distinct() %>%
  nrow()
## [1] 6152
# For each (Position, Alt) mutation, count the rows reporting it, then plot how
# many mutations fall at each recurrence level. Each bar is labelled with its
# count.
snvs_ins_del_clean %>%
  dplyr::count(Position, Alt) %>%
  ggplot(aes(x = factor(n))) +
  geom_bar() +
  stat_count(
    geom = "text",
    color = "black",
    angle = 0,
    aes(label = after_stat(count)),
    size = 2,
    # vjust > 1 places the label just above the top of the bar.
    position = position_stack(vjust = 1.1)
  ) +
  xlab("Number of times mutation observed") +
  ggtitle("Mutations per position: all mutations")
# Number of each mutation (position, alt base)
# including multiple counts of individual mutants
# This won't be reported but is here for curiosity.
# The value returned is the number of (Position, Alt) groups; the summed Count
# per group is computed but does not affect that total.
snvs_ins_del_clean %>%
  group_by(Position, Alt) %>%
  dplyr::summarise(
    num_occurrences_includes_counts = sum(Count),
    .groups = "drop"
  ) %>%
  nrow()
## [1] 2171
# Sum Count within each (Position, Alt) mutation, then plot how many mutations
# share each summed value. Each bar is labelled with its count.
snvs_ins_del_clean %>%
  group_by(Position, Alt) %>%
  dplyr::summarise(
    num_occurrences_includes_counts = sum(Count),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = factor(num_occurrences_includes_counts))) +
  geom_bar() +
  stat_count(
    geom = "text",
    color = "black",
    angle = 0,
    aes(label = after_stat(count)),
    size = 2,
    # vjust > 1 places the label just above the top of the bar.
    position = position_stack(vjust = 1.1)
  ) +
  xlab("Number of times mutation observed") +
  ggtitle("All mutations")
# Number of mutations of various types per nucleotide: rows are counted per
# (Position, Type, Alt) and the recurrence levels are plotted, filled by Type.
snvs_ins_del_clean %>%
  dplyr::count(Position, Type, Alt) %>%
  ggplot(aes(x = factor(n), fill = Type)) +
  geom_bar() +
  stat_count(
    geom = "text",
    color = "black",
    angle = 0,
    aes(label = after_stat(count)),
    size = 2,
    # Labels are stacked like the bars and sit near the top of each segment.
    position = position_stack(vjust = 0.9)
  ) +
  xlab("Number of times mutation observed") +
  ggtitle("Mutations by type: group by position and alternate allele")
# Number of mutations of various types per nucleotide: rows are counted per
# (Position, Type), ignoring the alternate allele, and the recurrence levels
# are plotted, filled by Type.
snvs_ins_del_clean %>%
  dplyr::count(Position, Type) %>%
  ggplot(aes(x = factor(n), fill = Type)) +
  geom_bar() +
  stat_count(
    geom = "text",
    color = "black",
    angle = 0,
    aes(label = after_stat(count)),
    size = 2,
    # Labels are stacked like the bars and sit near the top of each segment.
    position = position_stack(vjust = 0.9)
  ) +
  xlab("Number of times mutation observed") +
  ggtitle("Mutations by type: group by position")
# Number of mutations of various types per nucleotide: rows are counted per
# (Position, Type, Alt, Tissue, Dose, Exposure) and the recurrence levels are
# plotted, filled by Type.
snvs_ins_del_clean %>%
  dplyr::count(Position, Type, Alt, Tissue, Dose, Exposure) %>%
  ggplot(aes(x = factor(n), fill = Type)) +
  geom_bar() +
  stat_count(
    geom = "text",
    color = "black",
    angle = 0,
    aes(label = after_stat(count)),
    size = 2,
    # Labels are stacked like the bars and sit near the top of each segment.
    position = position_stack(vjust = 0.9)
  ) +
  xlab("Number of times mutation observed") +
  ggtitle("Mutations by type: group by position, alternate allele, tissue, dose, and exposure")
# Number of mutations per codon: SNV rows with a known Codon are counted per
# (Codon, Type, Alt Codon) and the recurrence levels are plotted.
snvs_ins_del_clean %>%
  filter(Type == "SNV") %>%
  drop_na(Codon) %>%
  dplyr::count(Codon, Type, `Alt Codon`) %>%
  ggplot(aes(x = factor(n), fill = Type)) +
  geom_bar() +
  stat_count(
    geom = "text",
    color = "white",
    angle = 0,
    aes(label = after_stat(count)),
    size = 2,
    # White labels centred vertically inside each bar.
    position = position_stack(vjust = 0.5)
  ) +
  xlab("Number of times mutation observed") +
  ggtitle("Number of mutations per codon")
Here are some numbers to answer the question of how many singletons vs repeated mutations were observed in the data, broken down in various ways:
# Number of mutations in total (one per row of the combined table)
dplyr::summarise(snvs_ins_del_clean, n = dplyr::n()) # 6,465
## # A tibble: 1 × 1
## n
## <int>
## 1 6465
# Number of mutations per chemical (rows per Exposure value)
dplyr::count(snvs_ins_del_clean, Exposure)
## # A tibble: 15 × 2
## Exposure n
## <chr> <int>
## 1 BaP 1919
## 2 BaP-IU 1008
## 3 CEDU 14
## 4 Control 1727
## 5 EMS 11
## 6 ENU 809
## 7 Ercc1 -/m 34
## 8 NDBzA 80
## 9 NDMA 47
## 10 PRC 205
## 11 Sunlight 64
## 12 TEM 258
## 13 UVB 120
## 14 X-ray 91
## 15 Xpa -/- 78
# Exposure n
# 1 BaP 1919
# 2 BaP-IU 1008
# 3 CEDU 14
# 4 Control 1727
# 5 EMS 11
# 6 ENU 809
# 7 Ercc1 -/m 34
# 8 NDBzA 80
# 9 NDMA 47
# 10 PRC 205
# 11 Sunlight 64
# 12 TEM 258
# 13 UVB 120
# 14 X-ray 91
# 15 Xpa -/- 78
# Number of mutations by type (rows per Type value)
dplyr::count(snvs_ins_del_clean, Type)
## # A tibble: 4 × 2
## Type n
## <chr> <int>
## 1 Complex 20
## 2 Deletion 1080
## 3 Insertion 218
## 4 SNV 5147
# Type n
# 1 Complex 20
# 2 Deletion 1080
# 3 Insertion 218
# 4 SNV 5147
# How many singleton mutations?
# A singleton is a (Position, Alt) pair reported by exactly one row.
snvs_ins_del_clean %>%
  dplyr::count(Position, Alt, name = "Number_of_times_observed") %>%
  dplyr::filter(Number_of_times_observed == 1) %>%
  dplyr::summarise(n = dplyr::n()) # 1,225
## # A tibble: 1 × 1
## n
## <int>
## 1 1225
# tally() %>% pull(n) %>% sum()
# How many non-singleton mutations?
# Counts the (Position, Alt) pairs reported by more than one row.
snvs_ins_del_clean %>%
  dplyr::count(Position, Alt, name = "Number_of_times_observed") %>%
  dplyr::filter(Number_of_times_observed > 1) %>%
  nrow() # There are 946
## [1] 946
# How many singleton mutations, SNVs only?
# Counts the (Position, Alt) pairs reported by exactly one SNV row.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  dplyr::count(Position, Alt, name = "Number_of_times_observed") %>%
  dplyr::filter(Number_of_times_observed == 1) %>%
  nrow() # There are 707
## [1] 707
# How many non-singleton mutations, SNVs only?
# Counts the (Position, Alt) pairs reported by more than one SNV row.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  dplyr::count(Position, Alt, name = "Number_of_times_observed") %>%
  dplyr::filter(Number_of_times_observed > 1) %>%
  nrow() # There are 690
## [1] 690
# How many singleton mutations, deletions only?
# Counts the (Position, Alt) pairs reported by exactly one deletion row.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "Deletion") %>%
  dplyr::count(Position, Alt, name = "Number_of_times_observed") %>%
  dplyr::filter(Number_of_times_observed == 1) %>%
  nrow() # There are 377
## [1] 377
# How many non-singleton mutations, deletions only?
# Counts the (Position, Alt) pairs reported by more than one deletion row.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "Deletion") %>%
  dplyr::count(Position, Alt, name = "Number_of_times_observed") %>%
  dplyr::filter(Number_of_times_observed > 1) %>%
  nrow() # There are 222
## [1] 222
# How many singleton mutations, insertions only?
# Counts the (Position, Alt) pairs reported by exactly one insertion row.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "Insertion") %>%
  dplyr::count(Position, Alt, name = "Number_of_times_observed") %>%
  dplyr::filter(Number_of_times_observed == 1) %>%
  nrow() # There are 169
## [1] 169
# How many non-singleton mutations, insertions only?
# Counts the (Position, Alt) pairs reported by more than one insertion row.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "Insertion") %>%
  dplyr::count(Position, Alt, name = "Number_of_times_observed") %>%
  dplyr::filter(Number_of_times_observed > 1) %>%
  nrow() # There are 20
## [1] 20
# Codons, singletons
# SNV rows with a known Codon; counts the (Codon, Alt Codon) pairs seen once.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  drop_na(Codon) %>%
  dplyr::count(Codon, `Alt Codon`, name = "Number_of_times_observed") %>%
  dplyr::filter(Number_of_times_observed == 1) %>%
  nrow()
## [1] 708
# Codons, non-singletons
# SNV rows with a known Codon; counts the (Codon, Alt Codon) pairs seen more
# than once.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  drop_na(Codon) %>%
  dplyr::count(Codon, `Alt Codon`, name = "Number_of_times_observed") %>%
  dplyr::filter(Number_of_times_observed > 1) %>%
  nrow()
## [1] 690
# Codons, number of unique loci (codons) that are mutated more than once
# (SNV rows with a known Codon only).
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  drop_na(Codon) %>%
  dplyr::count(Codon, name = "Number_of_times_observed") %>%
  dplyr::filter(Number_of_times_observed > 1) %>%
  nrow()
## [1] 498
# Total number of codons that are mutated (distinct non-missing Codon values,
# all mutation types)
snvs_ins_del_clean %>%
  drop_na(Codon) %>%
  dplyr::distinct(Codon) %>%
  nrow()
## [1] 691
# Number of distinct non-missing Codon values among SNV rows.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  drop_na(Codon) %>%
  dplyr::distinct(Codon) %>%
  nrow()
## [1] 682
# Number of Codon values (all mutation types) that occur in more than one row.
# NOTE(review): unlike the neighbouring codon counts, rows with a missing Codon
# are not dropped here, so an NA group can be included — confirm intended.
snvs_ins_del_clean %>%
  dplyr::count(Codon, name = "Number_of_times_observed") %>%
  dplyr::filter(Number_of_times_observed > 1) %>%
  nrow()
## [1] 509
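Compare the MutaMouse lacZ sequence to the reference sequence. This also shows where the MutaMouse sequence carries extra bases relative to the reference, which is the basis for the position offsets applied above.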
# Manually downloaded...
# laczref <- readDNAStringSet("data/raw/lacZ.reference.paper.fa", format="fasta",
# use.names=TRUE, with.qualities=FALSE)
################################################################################
# Load sequence from NCBI
################################################################################
# NOTE(review): esearch()/efetch() presumably come from the reutils package —
# confirm. Both calls need network access.
# Search the protein database for "beta galactosidase" and fetch the hits as
# FASTA XML. NOTE(review): betagal_x is not used in the code visible here —
# confirm it is needed further down.
betagal <- esearch("beta galactosidase", "protein")
betagal_x <- efetch(betagal, rettype = "fasta", retmode = "xml")
# Download nucleotide record V00296.1 as FASTA text into a temporary file...
tmp <- tempfile()
lacZ_NCBI <- efetch(
uid = "V00296.1",
db = "nucleotide",
retmode = "text",
rettype = "fasta",
outfile = tmp
)
# ...and read that file back as the lacZ reference sequence.
laczref <- readDNAStringSet(tmp)
# Local FASTA file read as the MutaMouse lacZ sequence.
lacz_mutamouse <- readDNAStringSet("data/raw/lacZ.fa",
format = "fasta",
use.names = TRUE,
with.qualities = FALSE
)
# Align the reference and the MutaMouse sequence, then print a summary view.
alignment <- msa(c(laczref, lacz_mutamouse))
## use default substitution matrix
print(alignment)
## CLUSTAL 2.1
##
## Call:
## msa(c(laczref, lacz_mutamouse))
##
## MsaDNAMultipleAlignment with 2 rows and 3096 columns
## aln names
## [1] ---ACCATGATTACGGATTCACTGG-...GGTCTGGTGTCAAAAATAATAATAA V00296.1 E. coli ...
## [2] ATGACCATGATTACGGATTCACTGGA...GGTCTGGTGTCAAAAATAATAATAA lacZ
## Con ???ACCATGATTACGGATTCACTGG?...GGTCTGGTGTCAAAAATAATAATAA Consensus
# Translate both nucleotide sequences to amino acid sequences.
laczref_aa <- Biostrings::translate(laczref)
lacz_mutamouse_aa <- Biostrings::translate(lacz_mutamouse)
# Align the two nucleotide sequences and print every alignment column with
# the sequence names shown.
alignment <- msa(c(laczref, lacz_mutamouse))
## use default substitution matrix
print(alignment, show = "complete", showNames = TRUE, type = "upperlower")
##
## MsaDNAMultipleAlignment with 2 rows and 3096 columns
## aln (1..54) names
## [1] ---ACCATGATTACGGATTCACTGG---------------CCGTCGTTTTACAA V00296.1 E. coli ...
## [2] ATGACCATGATTACGGATTCACTGGAATTCCCGGGGATCCCCGTCGTTTTACAA lacZ
## Con ...................................................... Consensus
##
## aln (55..108) names
## [1] CGTCGTGACTGGGAAAACCCTGGCGTTACCCAACTTAATCGCCTTGCAGCACAT V00296.1 E. coli ...
## [2] CGTCGTGACTGGGAAAACCCTGGCGTTACCCAACTTAATCGCCTTGCAGCACAT lacZ
## Con ...................................................... Consensus
##
## aln (109..162) names
## [1] CCCCCTTTCGCCAGCTGGCGTAATAGCGAAGAGGCCCGCACCGATCGCCCTTCC V00296.1 E. coli ...
## [2] CCCCCTTTCGCCAGCTGGCGTAATAGCGAAGAGGCCCGCACCGATCGCCCTTCC lacZ
## Con ...................................................... Consensus
##
## aln (163..216) names
## [1] CAACAGTTGCGCAGCCTGAATGGCGAATGGCGCTTTGCCTGGTTTCCGGCACCA V00296.1 E. coli ...
## [2] CAACAGTTGCGCAGCCTGAATGGCGAATGGCGCTTTGCCTGGTTTCCGGCACCA lacZ
## Con ...................................................... Consensus
##
## aln (217..270) names
## [1] GAAGCGGTGCCGGAAAGCTGGCTGGAGTGCGATCTTCCTGAGGCCGATACTGTC V00296.1 E. coli ...
## [2] GAAGCGGTGCCGGAAAGCTGGCTGGAGTGCGATCTTCCTGAGGCCGATACTGTC lacZ
## Con ...................................................... Consensus
##
## aln (271..324) names
## [1] GTCGTCCCCTCAAACTGGCAGATGCACGGTTACGATGCGCCCATCTACACCAAC V00296.1 E. coli ...
## [2] GTCGTCCCCTCAAACTGGCAGATGCACGGTTACGATGCGCCCATCTACACCAAC lacZ
## Con ...................................................... Consensus
##
## aln (325..378) names
## [1] GTAACCTATCCCATTACGGTCAATCCGCCGTTTGTTCCCACGGAGAATCCGACG V00296.1 E. coli ...
## [2] GTGACCTATCCCATTACGGTCAATCCGCCGTTTGTTCCCACGGAGAATCCGACG lacZ
## Con ...................................................... Consensus
##
## aln (379..432) names
## [1] GGTTGTTACTCGCTCACATTTAATGTTGATGAAAGCTGGCTACAGGAAGGCCAG V00296.1 E. coli ...
## [2] GGTTGTTACTCGCTCACATTTAATGTTGATGAAAGCTGGCTACAGGAAGGCCAG lacZ
## Con ...................................................... Consensus
##
## aln (433..486) names
## [1] ACGCGAATTATTTTTGATGGCGTTAACTCGGCGTTTCATCTGTGGTGCAACGGG V00296.1 E. coli ...
## [2] ACGCGAATTATTTTTGATGGCGTTAACTCGGCGTTTCATCTGTGGTGCAACGGG lacZ
## Con ...................................................... Consensus
##
## aln (487..540) names
## [1] CGCTGGGTCGGTTACGGCCAGGACAGTCGTTTGCCGTCTGAATTTGACCTGAGC V00296.1 E. coli ...
## [2] CGCTGGGTCGGTTACGGCCAGGACAGTCGTTTGCCGTCTGAATTTGACCTGAGC lacZ
## Con ...................................................... Consensus
##
## aln (541..594) names
## [1] GCATTTTTACGCGCCGGAGAAAACCGCCTCGCGGTGATGGTGCTGCGTTGGAGT V00296.1 E. coli ...
## [2] GCATTTTTACGCGCCGGAGAAAACCGCCTCGCGGTGATGGTGCTGCGCTGGAGT lacZ
## Con ...................................................... Consensus
##
## aln (595..648) names
## [1] GACGGCAGTTATCTGGAAGATCAGGATATGTGGCGGATGAGCGGCATTTTCCGT V00296.1 E. coli ...
## [2] GACGGCAGTTATCTGGAAGATCAGGATATGTGGCGGATGAGCGGCATTTTCCGT lacZ
## Con ...................................................... Consensus
##
## aln (649..702) names
## [1] GACGTCTCGTTGCTGCATAAACCGACTACACAAATCAGCGATTTCCATGTTGCC V00296.1 E. coli ...
## [2] GACGTCTCGTTGCTGCATAAACCGACTACACAAATCAGCGATTTCCATGTTGCC lacZ
## Con ...................................................... Consensus
##
## aln (703..756) names
## [1] ACTCGCTTTAATGATGATTTCAGCCGCGCTGTACTGGAGGCTGAAGTTCAGATG V00296.1 E. coli ...
## [2] ACTCGCTTTAATGATGATTTCAGCCGCGCTGTACTGGAGGCTGAAGTTCAGATG lacZ
## Con ...................................................... Consensus
##
## aln (757..810) names
## [1] TGCGGCGAGTTGCGTGACTACCTACGGGTAACAGTTTCTTTATGGCAGGGTGAA V00296.1 E. coli ...
## [2] TGCGGCGAGTTGCGTGACTACCTACGGGTAACAGTTTCTTTATGGCAGGGTGAA lacZ
## Con ...................................................... Consensus
##
## aln (811..864) names
## [1] ACGCAGGTCGCCAGCGGCACCGCGCCTTTCGGCGGTGAAATTATCGATGAGCGT V00296.1 E. coli ...
## [2] ACGCAGGTCGCCAGCGGCACCGCGCCTTTCGGCGGTGAAATTATCGATGAGCGT lacZ
## Con ...................................................... Consensus
##
## aln (865..918) names
## [1] GGTGGTTATGCCGATCGCGTCACACTACGTCTGAACGTCGAAAACCCGAAACTG V00296.1 E. coli ...
## [2] GGTGGTTATGCCGATCGCGTCACACTACGTCTGAACGTCGAAAACCCGAAACTG lacZ
## Con ...................................................... Consensus
##
## aln (919..972) names
## [1] TGGAGCGCCGAAATCCCGAATCTCTATCGTGCGGTGGTTGAACTGCACACCGCC V00296.1 E. coli ...
## [2] TGGAGCGCCGAAATCCCGAATCTCTATCGTGCGGTGGTTGAACTGCACACCGCC lacZ
## Con ...................................................... Consensus
##
## aln (973..1026) names
## [1] GACGGCACGCTGATTGAAGCAGAAGCCTGCGATGTCGGTTTCCGCGAGGTGCGG V00296.1 E. coli ...
## [2] GACGGCACGCTGATTGAAGCAGAAGCCTGCGATGTCGGTTTCCGCGAGGTGCGG lacZ
## Con ...................................................... Consensus
##
## aln (1027..1080) names
## [1] ATTGAAAATGGTCTGCTGCTGCTGAACGGCAAGCCGTTGCTGATTCGAGGCGTT V00296.1 E. coli ...
## [2] ATTGAAAATGGTCTGCTGCTGCTGAACGGCAAGCCGTTGCTGATTCGAGGCGTT lacZ
## Con ...................................................... Consensus
##
## aln (1081..1134) names
## [1] AACCGTCACGAGCATCATCCTCTGCATGGTCAGGTCATGGATGAGCAGACGATG V00296.1 E. coli ...
## [2] AACCGTCACGAGCATCATCCTCTGCATGGTCAGGTCATGGATGAGCAGACGATG lacZ
## Con ...................................................... Consensus
##
## aln (1135..1188) names
## [1] GTGCAGGATATCCTGCTGATGAAGCAGAACAACTTTAACGCCGTGCGCTGTTCG V00296.1 E. coli ...
## [2] GTGCAGGATATCCTGCTGATGAAGCAGAACAACTTTAACGCCGTGCGCTGTTCG lacZ
## Con ...................................................... Consensus
##
## aln (1189..1242) names
## [1] CATTATCCGAACCATCCGCTGTGGTACACGCTGTGCGACCGCTACGGCCTGTAT V00296.1 E. coli ...
## [2] CATTATCCGAACCATCCGCTGTGGTACACGCTGTGCGACCGCTACGGCCTGTAT lacZ
## Con ...................................................... Consensus
##
## aln (1243..1296) names
## [1] GTGGTGGATGAAGCCAATATTGAAACCCACGGCATGGTGCCAATGAATCGTCTG V00296.1 E. coli ...
## [2] GTGGTGGATGAAGCCAATATTGAAACCCACGGCATGGTGCCAATGAATCGTCTG lacZ
## Con ...................................................... Consensus
##
## aln (1297..1350) names
## [1] ACCGATGATCCGCGCTGGCTACCGGCGATGAGCGAACGCGTAACGCGAATGGTG V00296.1 E. coli ...
## [2] ACCGATGATCCGCGCTGGCTACCGGCGATGAGCGAACGCGTAACGCGAATGGTG lacZ
## Con ...................................................... Consensus
##
## aln (1351..1404) names
## [1] CAGCGCGATCGTAATCACCCGAGTGTGATCATCTGGTCGCTGGGGAATGAATCA V00296.1 E. coli ...
## [2] CAGCGCGATCGTAATCACCCGAGTGTGATCATCTGGTCGCTGGGGAATGAATCA lacZ
## Con ...................................................... Consensus
##
## aln (1405..1458) names
## [1] GGCCACGGCGCTAATCACGACGCGCTGTATCGCTGGATCAAATCTGTCGATCCT V00296.1 E. coli ...
## [2] GGCCACGGCGCTAATCACGACGCGCTGTATCGCTGGATCAAATCTGTCGATCCT lacZ
## Con ...................................................... Consensus
##
## aln (1459..1512) names
## [1] TCCCGCCCGGTGCAGTATGAAGGCGGCGGAGCCGACACCACGGCCACCGATATT V00296.1 E. coli ...
## [2] TCCCGCCCGGTGCAGTATGAAGGCGGCGGAGCCGACACCACGGCCACCGATATT lacZ
## Con ...................................................... Consensus
##
## aln (1513..1566) names
## [1] ATTTGCCCGATGTACGCGCGCGTGGATGAAGACCAGCCCTTCCCGGCTGTGCCG V00296.1 E. coli ...
## [2] ATTTGCCCGATGTACGCGCGCGTGGATGAAGACCAGCCCTTCCCGGCTGTGCCG lacZ
## Con ...................................................... Consensus
##
## aln (1567..1620) names
## [1] AAATGGTCCATCAAAAAATGGCTTTCGCTACCTGGAGAGACGCGCCCGCTGATC V00296.1 E. coli ...
## [2] AAATGGTCCATCAAAAAATGGCTTTCGCTACCTGGAGAGACGCGCCCGCTGATC lacZ
## Con ...................................................... Consensus
##
## aln (1621..1674) names
## [1] CTTTGCGAATACGCCCACGCGATGGGTAACAGTCTTGGCGGTTTCGCTAAATAC V00296.1 E. coli ...
## [2] CTTTGCGAATACGCCCACGCGATGGGTAACAGTCTTGGCGGTTTCGCTAAATAC lacZ
## Con ...................................................... Consensus
##
## aln (1675..1728) names
## [1] TGGCAGGCGTTTCGTCAGTATCCCCGTTTACAGGGCGGCTTCGTCTGGGACTGG V00296.1 E. coli ...
## [2] TGGCAGGCGTTTCGTCAGTATCCCCGTTTACAGGGCGGCTTCGTCTGGGACTGG lacZ
## Con ...................................................... Consensus
##
## aln (1729..1782) names
## [1] GTGGATCAGTCGCTGATTAAATATGATGAAAACGGCAACCCGTGGTCGGCTTAC V00296.1 E. coli ...
## [2] GTGGATCAGTCGCTGATTAAATATGATGAAAACGGCAACCCGTGGTCGGCTTAC lacZ
## Con ...................................................... Consensus
##
## aln (1783..1836) names
## [1] GGCGGTGATTTTGGCGATACGCCGAACGATCGCCAGTTCTGTATGAACGGTCTG V00296.1 E. coli ...
## [2] GGCGGTGATTTTGGCGATACGCCGAACGATCGCCAGTTCTGTATGAACGGTCTG lacZ
## Con ...................................................... Consensus
##
## aln (1837..1890) names
## [1] GTCTTTGCCGACCGCACGCCGCATCCAGCGCTGACGGAAGCAAAACACCAGCAG V00296.1 E. coli ...
## [2] GTCTTTGCCGACCGCACGCCGCATCCAGCGCTGACGGAAGCAAAACACCAGCAG lacZ
## Con ...................................................... Consensus
##
## aln (1891..1944) names
## [1] CAGTTTTTCCAGTTCCGTTTATCCGGGCAAACCATCGAAGTGACCAGCGAATAC V00296.1 E. coli ...
## [2] CAGTTTTTCCAGTTCCGTTTATCCGGGCAAACCATCGAAGTGACCAGCGAATAC lacZ
## Con ...................................................... Consensus
##
## aln (1945..1998) names
## [1] CTGTTCCGTCATAGCGATAACGAGCTCCTGCACTGGATGGTGGCGCTGGATGGT V00296.1 E. coli ...
## [2] CTGTTCCGTCATAGCGATAACGAGCTCCTGCACTGGATGGTGGCGCTGGATGGT lacZ
## Con ...................................................... Consensus
##
## aln (1999..2052) names
## [1] AAGCCGCTGGCAAGCGGTGAAGTGCCTCTGGATGTCGCTCCACAAGGTAAACAG V00296.1 E. coli ...
## [2] AAGCCGCTGGCAAGCGGTGAAGTGCCTCTGGATGTCGCTCCACAAGGTAAACAG lacZ
## Con ...................................................... Consensus
##
## aln (2053..2106) names
## [1] TTGATTGAACTGCCTGAACTACCGCAGCCGGAGAGCGCCGGGCAACTCTGGCTC V00296.1 E. coli ...
## [2] TTGATTGAACTGCCTGAACTACCGCAGCCGGAGAGCGCCGGGCAACTCTGGCTC lacZ
## Con ...................................................... Consensus
##
## aln (2107..2160) names
## [1] ACAGTACGCGTAGTGCAACCGAACGCGACCGCATGGTCAGAAGCCGGGCACATC V00296.1 E. coli ...
## [2] ACAGTACGCGTAGTGCAACCGAACGCGACCGCATGGTCAGAAGCCGGGCACATC lacZ
## Con ...................................................... Consensus
##
## aln (2161..2214) names
## [1] AGCGCCTGGCAGCAGTGGCGTCTGGCGGAAAACCTCAGTGTGACGCTCCCCGCC V00296.1 E. coli ...
## [2] AGCGCCTGGCAGCAGTGGCGTCTGGCGGAAAACCTCAGTGTGACGCTCCCCGCC lacZ
## Con ...................................................... Consensus
##
## aln (2215..2268) names
## [1] GCGTCCCACGCCATCCCGCATCTGACCACCAGCGAAATGGATTTTTGCATCGAG V00296.1 E. coli ...
## [2] GCGTCCCACGCCATCCCGCATCTGACCACCAGCGAAATGGATTTTTGCATCGAG lacZ
## Con ...................................................... Consensus
##
## aln (2269..2322) names
## [1] CTGGGTAATAAGCGTTGGCAATTTAACCGCCAGTCAGGCTTTCTTTCACAGATG V00296.1 E. coli ...
## [2] CTGGGTAATAAGCGTTGGCAATTTAACCGCCAGTCAGGCTTTCTTTCACAGATG lacZ
## Con ...................................................... Consensus
##
## aln (2323..2376) names
## [1] TGGATTGGCGATAAAAAACAACTGCTGACGCCGCTGCGCGATCAGTTCACCCGT V00296.1 E. coli ...
## [2] TGGATTGGCGATAAAAAACAACTGCTGACGCCGCTGCGCGATCAGTTCACCCGT lacZ
## Con ...................................................... Consensus
##
## aln (2377..2430) names
## [1] GCACCGCTGGATAACGACATTGGCGTAAGTGAAGCGACCCGCATTGACCCTAAC V00296.1 E. coli ...
## [2] GCACCGCTGGATAACGACATTGGCGTAAGTGAAGCGACCCGCATTGACCCTAAC lacZ
## Con ...................................................... Consensus
##
## aln (2431..2484) names
## [1] GCCTGGGTCGAACGCTGGAAGGCGGCGGGCCATTACCAGGCCGAAGCAGCGTTG V00296.1 E. coli ...
## [2] GCCTGGGTCGAACGCTGGAAGGCGGCGGGCCATTACCAGGCCGAAGCAGCGTTG lacZ
## Con ...................................................... Consensus
##
## aln (2485..2538) names
## [1] TTGCAGTGCACGGCAGATACACTTGCTGATGCGGTGCTGATTACGACCGCTCAC V00296.1 E. coli ...
## [2] TTGCAGTGCACGGCAGATACACTTGCTGATGCGGTGCTGATTACGACCGCTCAC lacZ
## Con ...................................................... Consensus
##
## aln (2539..2592) names
## [1] GCGTGGCAGCATCAGGGGAAAACCTTATTTATCAGCCGGAAAACCTACCGGATT V00296.1 E. coli ...
## [2] GCGTGGCAGCATCAGGGGAAAACCTTATTTATCAGCCGGAAAACCTACCGGATT lacZ
## Con ...................................................... Consensus
##
## aln (2593..2646) names
## [1] GATGGTAGTGGTCAAATGGCGATTACCGTTGATGTTGAAGTGGCGAGCGATACA V00296.1 E. coli ...
## [2] GATGGTAGTGGTCAAATGGCGATTACCGTTGATGTTGAAGTGGCGAGCGATACA lacZ
## Con ...................................................... Consensus
##
## aln (2647..2700) names
## [1] CCGCATCCGGCGCGGATTGGCCTGAACTGCCAGCTGGCGCAGGTAGCAGAGCGG V00296.1 E. coli ...
## [2] CCGCATCCGGCGCGGATTGGCCTGAACTGCCAGCTGGCGCAGGTAGCAGAGCGG lacZ
## Con ...................................................... Consensus
##
## aln (2701..2754) names
## [1] GTAAACTGGCTCGGATTAGGGCCGCAAGAAAACTATCCCGACCGCCTTACTGCC V00296.1 E. coli ...
## [2] GTAAACTGGCTCGGATTAGGGCCGCAAGAAAACTATCCCGACCGCCTTACTGCC lacZ
## Con ...................................................... Consensus
##
## aln (2755..2808) names
## [1] GCCTGTTTTGACCGCTGGGATCTGCCATTGTCAGACATGTATACCCCGTACGTC V00296.1 E. coli ...
## [2] GCCTGTTTTGACCGCTGGGATCTGCCATTGTCAGACATGTATACCCCGTACGTC lacZ
## Con ...................................................... Consensus
##
## aln (2809..2862) names
## [1] TTCCCGAGCGAAAACGGTCTGCGCTGCGGGACGCGCGAATTGAATTATGGCCCA V00296.1 E. coli ...
## [2] TTCCCGAGCGAAAACGGTCTGCGCTGCGGGACGCGCGAATTGAATTATGGCCCA lacZ
## Con ...................................................... Consensus
##
## aln (2863..2916) names
## [1] CACCAGTGGCGCGGCGACTTCCAGTTCAACATCAGCCGCTACAGTCAACAGCAA V00296.1 E. coli ...
## [2] CACCAGTGGCGCGGCGACTTCCAGTTCAACATCAGCCGCTACAGTCAACAGCAA lacZ
## Con ...................................................... Consensus
##
## aln (2917..2970) names
## [1] CTGATGGAAACCAGCCATCGCCATCTGCTGCACGCGGAAGAAGGCACATGGCTG V00296.1 E. coli ...
## [2] CTGATGGAAACCAGCCATCGCCATCTGCTGCACGCGGAAGAAGGCACATGGCTG lacZ
## Con ...................................................... Consensus
##
## aln (2971..3024) names
## [1] AATATCGACGGTTTCCATATGGGGATTGGTGGCGACGACTCCTGGAGCCCGTCA V00296.1 E. coli ...
## [2] AATATCGACGGTTTCCATATGGGGATTGGTGGCGACGACTCCTGGAGCCCGTCA lacZ
## Con ...................................................... Consensus
##
## aln (3025..3078) names
## [1] GTATCGGCGGAATTCCAGCTGAGCGCCGGTCGCTACCATTACCAGTTGGTCTGG V00296.1 E. coli ...
## [2] GTATCGGCGGAATTACAGCTGAGCGCCGGTCGCTACCATTACCAGTTGGTCTGG lacZ
## Con ...................................................... Consensus
##
## aln (3079..3096) names
## [1] TGTCAAAAATAATAATAA V00296.1 E. coli ...
## [2] TGTCAAAAATAATAATAA lacZ
## Con .................. Consensus
# help("print,MsaDNAMultipleAlignment-method")
# msa::msaPrettyPrint(alignment, output = "asis")
# Mutated positions in ascending order (missing values last), for all rows...
positions <- sort(snvs_ins_del_clean$Position, na.last = TRUE)
# ...and for rows whose Study is anything other than "This Study".
positions_other_studies <- snvs_ins_del_clean %>%
  dplyr::filter(Study != "This Study") %>%
  dplyr::pull(Position) %>%
  sort(na.last = TRUE)
# Where is ref C or G?
# Number of SNV rows per reference base.
dplyr::count(snvs, Ref)
## # A tibble: 4 × 2
## Ref n
## <chr> <int>
## 1 A 236
## 2 C 1402
## 3 G 1749
## 4 T 361
# Label each reference base as C/G or A/T, then count rows per reference base.
# NOTE(review): the count is taken by Ref, so the purine_pyrimidine label does
# not appear in the result and the output equals the plain per-Ref count —
# confirm whether grouping by the label was intended here.
snvs %>%
  dplyr::mutate(
    purine_pyrimidine = ifelse(Ref %in% c("C", "G"), yes = "C/G", no = "A/T")
  ) %>%
  dplyr::count(Ref)
## # A tibble: 4 × 2
## Ref n
## <chr> <int>
## 1 A 236
## 2 C 1402
## 3 G 1749
## 4 T 361
# Count SNV rows whose reference base is C/G versus A/T and report each
# class's share of the total, rounded to three decimals.
# (The column is named purine_pyrimidine, but the classes are C/G and A/T.)
snvs %>%
  dplyr::mutate(
    purine_pyrimidine = ifelse(Ref %in% c("C", "G"), yes = "C/G", no = "A/T")
  ) %>%
  dplyr::count(purine_pyrimidine) %>%
  dplyr::mutate(freq = round(n / sum(n), 3))
## # A tibble: 2 × 3
## purine_pyrimidine n freq
## <chr> <int> <dbl>
## 1 A/T 597 0.159
## 2 C/G 3151 0.841
# Break down by compound
# Reduce the SNV table to the three columns passed to GenVisR::TvTi()
# (sample, reference, variant); each Exposure value acts as one sample.
snvs_genvisr <- dplyr::select(
  dplyr::mutate(snvs, sample = Exposure, reference = Ref, variant = Alt),
  sample, reference, variant
)
# First call with default output, second call returns the underlying table.
GenVisR::TvTi(snvs_genvisr, fileType = "MGI", progress = FALSE)
## NULL
GenVisR::TvTi(snvs_genvisr, fileType = "MGI", out = "data", progress = FALSE)
## $main
## trans_tranv sample Freq Prop
## 1 A->C or T->G (TV) BaP 17 0.01305684
## 2 A->G or T->C (TI) BaP 12 0.00921659
## 3 A->T or T->A (TV) BaP 46 0.03533026
## 4 G->A or C->T (TI) BaP 242 0.18586790
## 5 G->C or C->G (TV) BaP 256 0.19662058
## 6 G->T or C->A (TV) BaP 729 0.55990783
## 7 A->C or T->G (TV) BaP-IU 31 0.04372355
## 8 A->G or T->C (TI) BaP-IU 14 0.01974612
## 9 A->T or T->A (TV) BaP-IU 34 0.04795487
## 10 G->A or C->T (TI) BaP-IU 137 0.19322990
## 11 G->C or C->G (TV) BaP-IU 137 0.19322990
## 12 G->T or C->A (TV) BaP-IU 356 0.50211566
## 13 A->C or T->G (TV) Control 28 0.03641092
## 14 A->G or T->C (TI) Control 32 0.04161248
## 15 A->T or T->A (TV) Control 40 0.05201560
## 16 G->A or C->T (TI) Control 409 0.53185956
## 17 G->C or C->G (TV) Control 55 0.07152146
## 18 G->T or C->A (TV) Control 205 0.26657997
## 25 A->C or T->G (TV) ENU 29 0.04991394
## 26 A->G or T->C (TI) ENU 44 0.07573150
## 27 A->T or T->A (TV) ENU 160 0.27538726
## 28 G->A or C->T (TI) ENU 199 0.34251291
## 29 G->C or C->G (TV) ENU 7 0.01204819
## 30 G->T or C->A (TV) ENU 142 0.24440620
## 31 A->C or T->G (TV) PRC 6 0.03208556
## 32 A->G or T->C (TI) PRC 21 0.11229947
## 33 A->T or T->A (TV) PRC 37 0.19786096
## 34 G->A or C->T (TI) PRC 78 0.41711230
## 35 G->C or C->G (TV) PRC 5 0.02673797
## 36 G->T or C->A (TV) PRC 40 0.21390374
## 37 A->C or T->G (TV) TEM 13 0.06500000
## 38 A->G or T->C (TI) TEM 13 0.06500000
## 39 A->T or T->A (TV) TEM 20 0.10000000
## 40 G->A or C->T (TI) TEM 87 0.43500000
## 41 G->C or C->G (TV) TEM 12 0.06000000
## 42 G->T or C->A (TV) TEM 55 0.27500000
##
## $expect
## NULL
# All together
# Same three-column layout, but every SNV is assigned to a single sample so
# the transition/transversion table is pooled over all exposures.
snvs_genvisr <- dplyr::select(
  dplyr::mutate(snvs, sample = "All lacZ Mutations", reference = Ref, variant = Alt),
  sample, reference, variant
)
GenVisR::TvTi(snvs_genvisr, fileType = "MGI", progress = FALSE)
## NULL
# Keep the data form of the result for the class totals computed next.
tv_ti_table <- GenVisR::TvTi(snvs_genvisr, fileType = "MGI", out = "data", progress = FALSE)
tv_ti_table
## $main
## trans_tranv sample Freq Prop
## 1 A->C or T->G (TV) All lacZ Mutations 124 0.03308431
## 2 A->G or T->C (TI) All lacZ Mutations 136 0.03628602
## 3 A->T or T->A (TV) All lacZ Mutations 337 0.08991462
## 4 G->A or C->T (TI) All lacZ Mutations 1152 0.30736393
## 5 G->C or C->G (TV) All lacZ Mutations 472 0.12593383
## 6 G->T or C->A (TV) All lacZ Mutations 1527 0.40741729
##
## $expect
## NULL
# Tag each substitution class with the parenthesised label found in
# trans_tranv ("(TI)" or "(TV)").
tv_ti_ratio <- dplyr::mutate(
  tv_ti_table$main,
  Class = str_extract(string = trans_tranv, pattern = regex("\\(\\w++\\)"))
)
# Attach the summed Freq of each Class to every row of that class.
tv_ti_ratio %>%
  dplyr::group_by(Class) %>%
  dplyr::mutate(class_count = sum(Freq))
## # A tibble: 6 × 6
## # Groups: Class [2]
## trans_tranv sample Freq Prop Class class_count
## <fct> <fct> <int> <dbl> <chr> <int>
## 1 A->C or T->G (TV) All lacZ Mutations 124 0.0331 (TV) 2460
## 2 A->G or T->C (TI) All lacZ Mutations 136 0.0363 (TI) 1288
## 3 A->T or T->A (TV) All lacZ Mutations 337 0.0899 (TV) 2460
## 4 G->A or C->T (TI) All lacZ Mutations 1152 0.307 (TI) 1288
## 5 G->C or C->G (TV) All lacZ Mutations 472 0.126 (TV) 2460
## 6 G->T or C->A (TV) All lacZ Mutations 1527 0.407 (TV) 2460
################################################################################
# lacZ Sequence Visualization
################################################################################
# Sequence track for the MutaMouse lacZ sequence.
lacZ_vis <- SequenceTrack(lacz_mutamouse, ucscChromosomeNames = FALSE)
# plotTracks(lacZ_vis, chromosome = "lacZ", from = 10, to = 40)
# Every mutation lives on the single pseudo-chromosome "lacZ".
snvs_ins_del_clean$chromosome <- "lacZ"
# One-base-wide range per mutation row (start and end are both Position);
# all remaining columns are carried along as metadata.
lacZ_granges <- makeGRangesFromDataFrame(
  snvs_ins_del_clean,
  seqnames.field = "chromosome",
  start.field = "Position",
  end.field = "Position",
  starts.in.df.are.0based = FALSE,
  ignore.strand = TRUE,
  keep.extra.columns = TRUE,
  seqinfo = NULL
)
lacZ_annotation <- AnnotationTrack(lacZ_granges, name = "lacZ data")
# plotTracks(list(lacZ_annotation, lacZ_vis), from=2200, to=2375)
# Some parameters to use in other tracks...
# Round `x` up to the next multiple of 10; values that already are a multiple
# of 10 are returned unchanged. Vectorised over `x`.
#
# The previous form, round(x + 5, -1), relied on how exact halves are rounded
# and therefore treated multiples of 10 inconsistently (10 -> 20, 20 -> 20).
roundup <- function(x) {
  ceiling(x / 10) * 10
}
# Set a y limit to be reused across all tracks
# Upper bound: the largest number of mutations seen at any single Position,
# passed through roundup().
overall_ylim_nt <- c(
  0,
  roundup(max(dplyr::pull(dplyr::count(snvs_ins_del_clean, Position), n)))
)
# overall_ylim = NULL
# Value to scale axis text
axis_scale <- 0.5
# Build one histogram-style (type "h") DataTrack of mutation counts per
# nucleotide position.
#   counts     - table with columns `Position` and `n`
#   track_name - label given to the track
# All tracks share overall_ylim_nt and axis_scale so their axes match.
make_position_track <- function(counts, track_name) {
  DataTrack(
    data = counts$n,
    start = counts$Position,
    end = counts$Position,
    chromosome = "lacZ",
    genome = "lacZ",
    name = track_name,
    type = "h",
    ylim = overall_ylim_nt,
    cex.axis = axis_scale
  )
}
# Every mutation, regardless of type or consequence
dat_all <- snvs_ins_del_clean %>%
  dplyr::count(Position)
all_mutations <- make_position_track(dat_all, "All Types")
# By consequence
dat_nonsense <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "nonsense") %>%
  dplyr::count(Position)
nonsense <- make_position_track(dat_nonsense, "Nonsense")
dat_missense <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::count(Position)
missense <- make_position_track(dat_missense, "Missense")
dat_frameshift <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "frameshift") %>%
  dplyr::count(Position)
frameshift <- make_position_track(dat_frameshift, "Frameshift")
# By mutation type
dat_insertions <- snvs_ins_del_clean %>%
  dplyr::filter(Type == "Insertion") %>%
  dplyr::count(Position)
insertions <- make_position_track(dat_insertions, "Insertions")
dat_deletions <- snvs_ins_del_clean %>%
  dplyr::filter(Type == "Deletion") %>%
  dplyr::count(Position)
deletions <- make_position_track(dat_deletions, "Deletions")
# plotTracks(list(lacZ_vis, all_mutations), type="horizon")
domains_feature_fill <- "slategray4"
# Domain boundaries are given as codon numbers and converted to nucleotides.
# Codon k occupies nucleotides 3k - 2 to 3k, so a domain runs from the first
# base of its first codon (3 * start - 2) to the last base of its last codon
# (3 * end). Using 3 * start would begin each box two bases too late.
domains_nt <- AnnotationTrack(
  start = c(49, 221, 336, 749) * 3 - 2,
  end = c(219, 334, 630, 1022) * 3,
  chromosome = "lacZ",
  fill = domains_feature_fill,
  id = c(
    "Sugar Binding",
    "β-Galactosidase",
    "TIM Barrel",
    "β-Galactosidase Small Chain"
  ),
  genome = "lacZ", name = "Domains"
)
# Shade the same four domain intervals behind all per-position tracks.
domain_ht_nt <- HighlightTrack(
  trackList = list(
    all_mutations,
    missense,
    nonsense,
    frameshift,
    insertions,
    deletions
  ),
  start = c(49, 221, 336, 749) * 3 - 2,
  end = c(219, 334, 630, 1022) * 3,
  chromosome = "lacZ",
  fill = c(
    "#FFE5E5", # rgb(255, 229, 229)
    "#F4FAED", # rgb(244, 250, 237),
    "#F0EAF5", # rgb(240, 234, 245),
    "#FCF0E6"
  ), # rgb(252, 240, 230)), #"snow3","snow2"
  col = "#000000FF", # rgb(0,0,0, alpha=1)
  lwd = 0.2,
  inBackground = TRUE
)
# hs <- as.numeric(levels(ranked_hotspots[["Position"]]))
# hotspots_range <- GRanges(seqnames = "lacZ",
# ranges = IRanges(start = hs,
# end = hs))
#
# deTrack <- AnnotationTrack(range = hotspots_range,
# genome = "lacZ",
# chromosome = "lacZ",
# name = "Hotspots",
# stacking = "squish")
# Nucleotide axis track used by the nucleotide-level figures.
gtrack <- GenomeAxisTrack(
  name = "lacZ gene",
  add53 = TRUE,
  littleTicks = TRUE,
  showId = TRUE
)
displayPars(domains_nt) <- list(size = 5)
# Figure: domain annotation, highlighted mutation tracks, then the axis.
plotTracks(
  list( # lacZ_vis,deTrack,
    domains_nt,
    domain_ht_nt,
    gtrack
  ),
  stackHeight = 1,
  featureAnnotation = "id",
  fontsize.feature = 7,
  fontcolor.feature = "white",
  background.panel = "transparent",
  background.title = "slategrey"
)
# Per-position mutation counts, restricted to rows sequenced by NGS.
dat_nt_ngs <- snvs_ins_del_clean %>%
  dplyr::filter(Technology == "NGS") %>%
  dplyr::count(Position)
ngs_nt <- DataTrack(
  name = "NGS",
  data = dat_nt_ngs[["n"]],
  start = dat_nt_ngs[["Position"]],
  end = dat_nt_ngs[["Position"]],
  chromosome = "lacZ",
  genome = "lacZ",
  type = "h",
  ylim = overall_ylim_nt,
  cex.axis = axis_scale
)
# Per-position mutation counts, restricted to rows sequenced by Sanger.
dat_nt_sanger <- snvs_ins_del_clean %>%
  dplyr::filter(Technology == "Sanger") %>%
  dplyr::count(Position)
sanger_nt <- DataTrack(
  name = "Sanger",
  data = dat_nt_sanger[["n"]],
  start = dat_nt_sanger[["Position"]],
  end = dat_nt_sanger[["Position"]],
  chromosome = "lacZ",
  genome = "lacZ",
  type = "h",
  ylim = overall_ylim_nt,
  cex.axis = axis_scale
)
# Shade the four domain intervals behind the all/NGS/Sanger tracks.
# Domain boundaries are codon numbers; codon k occupies nucleotides 3k - 2 to
# 3k, so each interval starts at 3 * start - 2 (first base of the first codon)
# and ends at 3 * end (last base of the last codon).
domain_ht_nt_tech <- HighlightTrack(
  trackList = list(
    all_mutations,
    ngs_nt,
    sanger_nt
  ),
  start = c(49, 221, 336, 749) * 3 - 2,
  end = c(219, 334, 630, 1022) * 3,
  chromosome = "lacZ",
  fill = c(
    "#FFE5E5", # rgb(255, 229, 229, max = 255)
    "#F4FAED", # rgb(244, 250, 237, max = 255),
    "#F0EAF5", # rgb(240, 234, 245, max = 255),
    "#FCF0E6"
  ), # rgb(252, 240, 230, max = 255)), #"snow3","snow2"
  col = "#000000FF", # rgb(0,0,0, alpha=1)
  lwd = 0.2,
  inBackground = TRUE
)
plotTracks(
  list( # lacZ_vis,deTrack,
    domains_nt,
    domain_ht_nt_tech,
    gtrack
  ),
  featureAnnotation = "id",
  fontcolor.feature = "white",
  background.title = "slategrey", # brown?
  background.panel = "transparent",
  fontsize.feature = 7,
  stackHeight = 1
)
################################################################################
# Beta-Gal Protein Visualization
################################################################################
# Axis for the protein-level (codon-numbered) figures.
paxTrack <- ProteinAxisTrack(littleTicks = TRUE, addNC = FALSE)
# Protein sequence track for laczref_aa, with the four domains supplied as
# named ranges in codon coordinates.
lacZ_vis_aa <- ProteinSequenceTrack(
  laczref_aa,
  name = "β-Gal",
  chromosome = "lacZ",
  labelPos = "below",
  cex = 0.5,
  range = IRanges(
    start = c(49, 221, 336, 749),
    end = c(219, 334, 630, 1022),
    names = c(
      "Sugar Binding",
      "β-Galactosidase",
      "TIM Barrel",
      "β-Galactosidase\nSmall Chain"
    )
  )
)
# Domain annotation in codon coordinates (start-end; accession):
#   Sugar Binding               (49-219;   PF02837)
#   β-Galactosidase             (221-334;  PF00703)
#   TIM Barrel                  (336-630;  PF02836)
#   β-Galactosidase Small Chain (749-1022; PF02929)
domains <- AnnotationTrack(
  name = "β-Gal",
  genome = "lacZ",
  chromosome = "lacZ",
  start = c(49, 221, 336, 749),
  end = c(219, 334, 630, 1022),
  id = c(
    "Sugar Binding",
    "β-Galactosidase",
    "TIM Barrel",
    "β-Galactosidase Small Chain"
  ),
  fill = domains_feature_fill
)
# Set a y limit to be reused across all tracks
# Upper bound: the largest number of mutations at any single codon, after
# dropping the rows of the count table that contain NA.
overall_ylim_codon <- c(
  0,
  max(dplyr::pull(drop_na(dplyr::count(snvs_ins_del_clean, Codon)), n))
)
# overall_ylim = NULL
# Missense mutations per codon (count rows containing NA are dropped).
# NOTE(review): despite the name all_codon_mutations, this track is built
# from missense rows only, as its track name says.
dat_aa <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::count(Codon) %>%
  drop_na()
all_codon_mutations <- DataTrack(
  name = "Missense Mutations",
  data = dat_aa[["n"]],
  start = dat_aa[["Codon"]],
  end = dat_aa[["Codon"]],
  chromosome = "lacZ",
  genome = "lacZ",
  type = "h",
  ylim = overall_ylim_codon
)
# Missense mutations per codon, Sanger rows only.
dat_aa_sanger <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense", Technology == "Sanger") %>%
  dplyr::count(Codon) %>%
  drop_na()
codon_mutations_sanger <- DataTrack(
  name = "Sanger",
  data = dat_aa_sanger[["n"]],
  start = dat_aa_sanger[["Codon"]],
  end = dat_aa_sanger[["Codon"]],
  chromosome = "lacZ",
  genome = "lacZ",
  type = "h",
  ylim = overall_ylim_codon
)
# Missense mutations per codon, NGS rows only.
dat_aa_ngs <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense", Technology == "NGS") %>%
  dplyr::count(Codon) %>%
  drop_na()
codon_mutations_ngs <- DataTrack(
  name = "NGS",
  data = dat_aa_ngs[["n"]],
  start = dat_aa_ngs[["Codon"]],
  end = dat_aa_ngs[["Codon"]],
  chromosome = "lacZ",
  genome = "lacZ",
  type = "h",
  ylim = overall_ylim_codon
)
# Two zoom levels of the same three tracks: codons 1-100, then 1-500.
plotTracks(
  list(lacZ_vis_aa, domains, all_codon_mutations),
  from = 1,
  to = 100
)
plotTracks(
  list(lacZ_vis_aa, domains, all_codon_mutations),
  from = 1,
  to = 500
)
# Shade the four domain intervals (codon coordinates) behind the missense
# tracks for all data, NGS only and Sanger only.
domain_ht <- HighlightTrack(
  trackList = list(
    all_codon_mutations,
    codon_mutations_ngs,
    codon_mutations_sanger
  ),
  chromosome = "lacZ",
  start = c(49, 221, 336, 749),
  end = c(219, 334, 630, 1022),
  # Fill colours, one per domain:
  #   rgb(255, 229, 229), rgb(244, 250, 237),
  #   rgb(240, 234, 245), rgb(252, 240, 230)   (alternatives: "snow3","snow2")
  fill = c("#FFE5E5", "#F4FAED", "#F0EAF5", "#FCF0E6"),
  col = "#000000FF", # rgb(0,0,0, alpha=1)
  lwd = 0.2,
  inBackground = TRUE
)
plotTracks(
  list( # lacZ_vis_aa,
    domains,
    domain_ht,
    paxTrack
  ),
  stackHeight = 1,
  featureAnnotation = "id",
  fontsize.feature = 7,
  fontcolor.feature = "white",
  background.panel = "transparent",
  background.title = "slategrey" # brown?
)
# featureAnnotation = "id",
# fontcolor.feature = "darkblue",
# background.title = "brown",
# background.panel = "transparent",
# fontsize.feature = 7)
# Build one histogram-style (type "h") DataTrack of mutation counts per codon.
#   counts     - table with columns `Codon` and `n`
#   track_name - label given to the track
# All tracks share overall_ylim_codon and axis_scale so their axes match.
make_codon_track <- function(counts, track_name) {
  DataTrack(
    data = counts$n,
    start = counts$Codon,
    end = counts$Codon,
    chromosome = "lacZ",
    genome = "lacZ",
    name = track_name,
    type = "h",
    ylim = overall_ylim_codon,
    cex.axis = axis_scale
  )
}
# By consequence (count rows containing NA, e.g. a missing codon, are dropped)
dat_nonsense_aa <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "nonsense") %>%
  dplyr::count(Codon) %>%
  drop_na()
nonsense_aa <- make_codon_track(dat_nonsense_aa, "Nonsense")
dat_missense_aa <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::count(Codon) %>%
  drop_na()
missense_aa <- make_codon_track(dat_missense_aa, "Missense")
dat_frameshift_aa <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "frameshift") %>%
  dplyr::count(Codon) %>%
  drop_na()
frameshift_aa <- make_codon_track(dat_frameshift_aa, "Frameshift")
# By mutation type
dat_insertions_aa <- snvs_ins_del_clean %>%
  dplyr::filter(Type == "Insertion") %>%
  dplyr::count(Codon) %>%
  drop_na()
insertions_aa <- make_codon_track(dat_insertions_aa, "Insertions")
dat_deletions_aa <- snvs_ins_del_clean %>%
  dplyr::filter(Type == "Deletion") %>%
  dplyr::count(Codon) %>%
  drop_na()
deletions_aa <- make_codon_track(dat_deletions_aa, "Deletions")
# Shade the four domain intervals (codon coordinates) behind the missense and
# nonsense tracks; frameshift/insertion/deletion tracks are left out.
# NOTE(review): all_codon_mutations is built from missense rows only, so it
# appears to show the same counts as missense_aa - confirm this is intended.
domain_ht_aa_type <- HighlightTrack(
  trackList = list( # frameshift_aa,
    # insertions_aa,
    # deletions_aa,
    all_codon_mutations,
    missense_aa,
    nonsense_aa
  ),
  chromosome = "lacZ",
  start = c(49, 221, 336, 749),
  end = c(219, 334, 630, 1022),
  # Fill colours, one per domain:
  #   rgb(255, 229, 229), rgb(244, 250, 237),
  #   rgb(240, 234, 245), rgb(252, 240, 230)   (alternatives: "snow3","snow2")
  fill = c("#FFE5E5", "#F4FAED", "#F0EAF5", "#FCF0E6"),
  col = "#000000FF", # rgb(0,0,0, alpha=1)
  lwd = 0.2,
  inBackground = TRUE
)
plotTracks(
  list(
    domains,
    domain_ht_aa_type,
    paxTrack
  ),
  stackHeight = 1,
  featureAnnotation = "id",
  fontsize.feature = 7,
  fontcolor.feature = "white",
  background.panel = "transparent",
  background.title = "slategrey" # brown?
)
# Amino acids, spontaneous mutations only
# Mutations per codon among rows whose Exposure is "Control".
dat_spontaneous <- snvs_ins_del_clean %>%
  dplyr::filter(Exposure == "Control") %>%
  dplyr::count(Codon) %>%
  drop_na()
# 378 codons from spontaneous (controls)
# This track uses its own y range (0 to its own maximum), not the shared one.
spontaneous_aa <- DataTrack(
  name = "Spontaneous Mutations",
  data = dat_spontaneous[["n"]],
  start = dat_spontaneous[["Codon"]],
  end = dat_spontaneous[["Codon"]],
  chromosome = "lacZ",
  genome = "lacZ",
  type = "h",
  ylim = c(0, max(dat_spontaneous[["n"]])),
  cex.axis = axis_scale
)
domain_ht_aa_spontaneous <- HighlightTrack(
  trackList = list(spontaneous_aa),
  chromosome = "lacZ",
  start = c(49, 221, 336, 749),
  end = c(219, 334, 630, 1022),
  # Fill colours, one per domain:
  #   rgb(255, 229, 229), rgb(244, 250, 237),
  #   rgb(240, 234, 245), rgb(252, 240, 230)   (alternatives: "snow3","snow2")
  fill = c("#FFE5E5", "#F4FAED", "#F0EAF5", "#FCF0E6"),
  col = "#000000FF", # rgb(0,0,0, alpha=1)
  lwd = 0.2,
  inBackground = TRUE
)
plotTracks(
  list(
    domains,
    domain_ht_aa_spontaneous,
    paxTrack
  ),
  stackHeight = 1,
  featureAnnotation = "id",
  fontsize.feature = 7,
  fontcolor.feature = "white",
  background.panel = "transparent",
  background.title = "slategrey" # brown?
)
# Amino acids, induced mutations only
# Mutations per codon among rows whose Exposure is anything but "Control".
dat_induced <- snvs_ins_del_clean %>%
  dplyr::filter(Exposure != "Control") %>%
  dplyr::count(Codon) %>%
  drop_na()
# This track uses its own y range (0 to its own maximum), not the shared one.
induced_aa <- DataTrack(
  name = "Induced Mutations",
  data = dat_induced[["n"]],
  start = dat_induced[["Codon"]],
  end = dat_induced[["Codon"]],
  chromosome = "lacZ",
  genome = "lacZ",
  type = "h",
  ylim = c(0, max(dat_induced[["n"]])),
  cex.axis = axis_scale
)
domain_ht_aa_induced <- HighlightTrack(
  trackList = list(induced_aa),
  chromosome = "lacZ",
  start = c(49, 221, 336, 749),
  end = c(219, 334, 630, 1022),
  # Fill colours, one per domain:
  #   rgb(255, 229, 229), rgb(244, 250, 237),
  #   rgb(240, 234, 245), rgb(252, 240, 230)   (alternatives: "snow3","snow2")
  fill = c("#FFE5E5", "#F4FAED", "#F0EAF5", "#FCF0E6"),
  col = "#000000FF", # rgb(0,0,0, alpha=1)
  lwd = 0.2,
  inBackground = TRUE
)
plotTracks(
  list(
    domains,
    domain_ht_aa_induced,
    paxTrack
  ),
  stackHeight = 1,
  featureAnnotation = "id",
  fontsize.feature = 7,
  fontcolor.feature = "white",
  background.panel = "transparent",
  background.title = "slategrey" # brown?
)
# Silent mutations per nucleotide (despite the "_aa" suffix this is a
# nucleotide-level track). Rows without a PositionRef are dropped.
# NOTE(review): this is the only track keyed on PositionRef; the other
# nucleotide tracks and the domain intervals use Position-based coordinates.
# Confirm that mixing the two coordinate systems here is intended.
dat_silent <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "silent") %>%
  group_by(PositionRef) %>%
  tally() %>%
  drop_na()
# This track uses its own y range (0 to its own maximum), not the shared one.
silent_aa <- DataTrack(
  data = dat_silent$n,
  start = dat_silent$PositionRef,
  end = dat_silent$PositionRef,
  chromosome = "lacZ",
  genome = "lacZ",
  name = "Silent Mutations",
  type = "h",
  ylim = c(0, max(dat_silent$n)),
  cex.axis = axis_scale
)
# Domain boundaries are codon numbers; codon k occupies nucleotides 3k - 2 to
# 3k, so each interval starts at 3 * start - 2 (first base of the first codon)
# and ends at 3 * end (last base of the last codon).
domain_ht_aa_silent <- HighlightTrack(
  trackList = list(
    silent_aa
  ),
  start = c(49, 221, 336, 749) * 3 - 2,
  end = c(219, 334, 630, 1022) * 3,
  chromosome = "lacZ",
  fill = c(
    "#FFE5E5", # rgb(255, 229, 229)
    "#F4FAED", # rgb(244, 250, 237),
    "#F0EAF5", # rgb(240, 234, 245),
    "#FCF0E6"
  ), # rgb(252, 240, 230)), #"snow3","snow2"
  col = "#000000FF", # rgb(0,0,0, alpha=1)
  lwd = 0.2,
  inBackground = TRUE
)
plotTracks(
  list(
    domains_nt,
    domain_ht_aa_silent,
    gtrack
  ),
  featureAnnotation = "id",
  fontcolor.feature = "white",
  background.title = "slategrey", # brown?
  background.panel = "transparent",
  fontsize.feature = 7,
  stackHeight = 1
)
# Consider reference base and whether there is a functional change
# For every (Type, Ref, Alt) combination, FunctionalChangePercent is the
# percentage of its rows with FunctionalChange == 1; the table is then
# reduced to one row per (Position, Type, Ref, Alt) with its observation
# count n.
occurrences <- snvs_ins_del_clean %>%
  group_by(Type, Ref, Alt) %>%
  mutate(FunctionalChangePercent = 100 * (sum(FunctionalChange) / dplyr::n())) %>%
  group_by(Position, Type, Ref, Alt, FunctionalChangePercent) %>%
  tally()
# geom_bar() counts rows per discrete x value by default; it replaces
# geom_histogram(stat = "count"), which overrides the histogram's own stat.
ggplot(occurrences %>% filter(Type == "SNV"), aes(x = factor(n))) +
  geom_bar() +
  facet_wrap(~Ref) +
  theme(axis.text.x = element_text(size = 5)) +
  ggtitle("Number of times mutation observed, by reference base")
ggplot(occurrences %>% filter(Type == "SNV"), aes(x = factor(n), fill = Ref)) +
  geom_bar() +
  facet_grid(Alt ~ Ref) +
  theme(axis.text.x = element_text(size = 5)) +
  ggtitle("Number of times mutation observed, by reference and alternate base")
# Share of SNVs that change the amino acid, for each reference base,
# alternate base and codon position. FunctionalChange is recomputed here as
# a logical: TRUE when the reference and alternate amino acids differ.
occurrences_ratio <- snvs_ins_del_clean %>%
  filter(Type == "SNV") %>%
  mutate(FunctionalChange = `Ref A.A.` != `Alt A.A.`) %>%
  group_by(Ref, Alt, codon_position) %>%
  dplyr::count(FunctionalChange) %>%
  mutate(ratio = scales::percent(n / sum(n)))
ggplot(occurrences_ratio, aes(x = Ref, y = n, fill = FunctionalChange)) +
  geom_col(position = "fill") +
  geom_text(aes(y = n, label = ratio), position = position_fill(vjust = 0.5)) +
  facet_grid(Alt ~ codon_position, scales = "free_x") +
  ggtitle("Classification of functional change, by reference and alternate base")
# Same breakdown, pooled over the alternate base.
occurrences_ratio_target_only <- snvs_ins_del_clean %>%
  filter(Type == "SNV") %>%
  mutate(FunctionalChange = `Ref A.A.` != `Alt A.A.`) %>%
  group_by(Ref, codon_position) %>%
  dplyr::count(FunctionalChange) %>%
  mutate(ratio = scales::percent(n / sum(n)))
ggplot(occurrences_ratio_target_only, aes(x = Ref, y = n, fill = FunctionalChange)) +
  geom_col(position = "fill") +
  geom_text(aes(y = n, label = ratio), position = position_fill(vjust = 0.5)) +
  facet_grid(. ~ codon_position, scales = "free_x") +
  ggtitle("Classification of functional change, by reference base, split by position in codon")
# Same breakdown again, additionally split by Exposure.
occurrences_ratio_target_only_chem <- snvs_ins_del_clean %>%
  filter(Type == "SNV") %>%
  mutate(FunctionalChange = `Ref A.A.` != `Alt A.A.`) %>%
  group_by(Ref, codon_position, Exposure) %>%
  dplyr::count(FunctionalChange) %>%
  mutate(ratio = scales::percent(n / sum(n)))
ggplot(occurrences_ratio_target_only_chem, aes(x = Ref, y = n, fill = FunctionalChange)) +
  geom_col(position = "fill") +
  geom_text(aes(y = n, label = ratio), position = position_fill(vjust = 0.5)) +
  facet_grid(Exposure ~ codon_position, scales = "free_x") +
  ggtitle("Classification of functional change, by reference base, split by position in codon")
# Render the ratio table. Table styling is applied first and scroll_box()
# last, because scroll_box() wraps the finished table in a scrolling
# container.
occurrences_ratio %>%
  knitr::kable() %>%
  kableExtra::kable_paper() %>%
  kableExtra::scroll_box(height = "480px")
| Ref | Alt | codon_position | FunctionalChange | n | ratio |
|---|---|---|---|---|---|
| A | C | 1 | TRUE | 17 | 100% |
| A | C | 2 | TRUE | 32 | 100% |
| A | C | 3 | FALSE | 11 | 73% |
| A | C | 3 | TRUE | 4 | 27% |
| A | G | 1 | TRUE | 28 | 100% |
| A | G | 2 | TRUE | 79 | 100% |
| A | G | 3 | FALSE | 7 | 100% |
| A | T | 1 | TRUE | 70 | 100% |
| A | T | 2 | TRUE | 79 | 100% |
| A | T | 3 | FALSE | 3 | 18% |
| A | T | 3 | TRUE | 14 | 82% |
| C | A | 1 | FALSE | 1 | 1% |
| C | A | 1 | TRUE | 113 | 99% |
| C | A | 2 | TRUE | 257 | 100% |
| C | A | 3 | FALSE | 40 | 9% |
| C | A | 3 | TRUE | 401 | 91% |
| C | G | 1 | TRUE | 51 | 100% |
| C | G | 2 | TRUE | 71 | 100% |
| C | G | 3 | FALSE | 10 | 8% |
| C | G | 3 | TRUE | 114 | 92% |
| C | T | 1 | FALSE | 5 | 1% |
| C | T | 1 | TRUE | 668 | 99% |
| C | T | 2 | TRUE | 180 | 100% |
| C | T | 3 | FALSE | 17 | 100% |
| G | A | 1 | TRUE | 458 | 100% |
| G | A | 2 | TRUE | 378 | 100% |
| G | A | 3 | FALSE | 53 | 18% |
| G | A | 3 | TRUE | 236 | 82% |
| G | C | 1 | TRUE | 125 | 100% |
| G | C | 2 | TRUE | 121 | 100% |
| G | C | 3 | FALSE | 17 | 42% |
| G | C | 3 | TRUE | 23 | 57% |
| G | T | 1 | TRUE | 622 | 100% |
| G | T | 2 | TRUE | 233 | 100% |
| G | T | 3 | FALSE | 45 | 47.9% |
| G | T | 3 | TRUE | 49 | 52.1% |
| T | A | 1 | TRUE | 39 | 100% |
| T | A | 2 | TRUE | 114 | 100% |
| T | A | 3 | FALSE | 6 | 5% |
| T | A | 3 | TRUE | 123 | 95% |
| T | C | 1 | TRUE | 96 | 100% |
| T | C | 2 | FALSE | 1 | 3% |
| T | C | 2 | TRUE | 36 | 97% |
| T | C | 3 | FALSE | 7 | 100% |
| T | G | 1 | TRUE | 15 | 100% |
| T | G | 2 | TRUE | 34 | 100% |
| T | G | 3 | FALSE | 4 | 9% |
| T | G | 3 | TRUE | 40 | 91% |
# Locate every "CG" dinucleotide in the first sequence of lacz_mutamouse.
CpGs <- Biostrings::matchPattern("CG", lacz_mutamouse[[1]])
CpGs
## Views on a 3096-letter DNAString subject
## subject: ATGACCATGATTACGGATTCACTGGAATTCCCGG...TTACCAGTTGGTCTGGTGTCAAAAATAATAATAA
## views:
## start end width
## [1] 14 15 2 [CG]
## [2] 32 33 2 [CG]
## [3] 42 43 2 [CG]
## [4] 45 46 2 [CG]
## [5] 55 56 2 [CG]
## ... ... ... ... ...
## [287] 3029 3030 2 [CG]
## [288] 3032 3033 2 [CG]
## [289] 3048 3049 2 [CG]
## [290] 3051 3052 2 [CG]
## [291] 3055 3056 2 [CG]
length(CpGs)
## [1] 291
# 291 CpG sites in total in the sequence searched above (lacz_mutamouse)
# Turn the CpG matches into genomic ranges on the "lacZ" sequence.
CpG_sites_ranges <- GRanges(
  seqnames = "lacZ",
  ranges = IRanges(
    start = start(ranges(CpGs)),
    end = end(ranges(CpGs))
  )
)
# Mutations overlapping a CpG, as a data frame restricted to SNVs.
CpGs_in_data <- lacZ_granges %>%
  plyranges::find_overlaps(CpG_sites_ranges) %>%
  as.data.frame() %>%
  dplyr::filter(Type == "SNV")
# GC content of one or more sequences, returned as a fraction between 0 and 1
# (not a percentage).
#   seq - character vector of sequences; G/C are counted in either case, so
#         lower-case sequence is no longer silently ignored.
# Returns one value per element of `seq`; an empty string gives NaN (0 / 0).
calculate_gc <- function(seq) {
  seq <- as.character(seq)
  # Strip everything that is not G or C; what remains is the G + C count.
  n_gc <- nchar(gsub("[^GCgc]", "", seq))
  n_gc / nchar(seq)
}
# GC fraction of the first sequence in lacz_mutamouse.
calculate_gc(as.character(unlist(lacz_mutamouse[[1]])))
## [1] 0.5613695
# How many bases inside CpG dinucleotides carry an SNV?
# The mutation ranges are one base wide and every CpG spans two bases (its C
# and its G), so distinct `start` values count mutated bases, not CpG sites.
CpGs_in_data %>%
  dplyr::distinct(start) %>%
  nrow()
## [1] 276
# 276 bases lying within a CpG have SNVs
# How many of the CpG sites are accounted for?
# Keep each CpG range that overlaps at least one SNV, then count the ranges.
snv_granges <- lacZ_granges[which(lacZ_granges$Type == "SNV")]
plyranges::filter_by_overlaps(CpG_sites_ranges, snv_granges) %>%
  length()
existing <- read.table("./data/raw/21_existing_amino_acid_changes.txt",
  header = TRUE,
  sep = "\t"
)
# Lift a nucleotide coordinate into the ref_* coordinate columns (the offsets
# account for the MutaMouse insertion):
#   positions below 26 -> subtract 3
#   positions above 40 -> subtract 18
#   positions 26 to 40 (and NA) -> NA
# Whole columns are computed at once, so the new columns always have one value
# per row; assigning into a not-yet-existing column through a logical index
# only works when the index happens to reach the last row and has no NA.
lift_nuc_position <- function(pos) {
  ifelse(pos < 26, pos - 3, ifelse(pos > 40, pos - 18, NA_real_))
}
# There is nothing below 26 in this file, but the rule is applied anyway.
existing$ref_nuc_start <- lift_nuc_position(existing$nuc_start)
existing$ref_nuc_end <- lift_nuc_position(existing$nuc_end)
# Human-readable "start - end" label; the two source columns are kept.
existing <- existing %>% tidyr::unite("ref_nuc_range", ref_nuc_start:ref_nuc_end, sep = " - ", remove = FALSE)
write.table(existing,
  file = "data/processed/21_existing_amino_acid_changes_coordinates_lifted.txt",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE
)
# Number of "independent mutations"
# This means: how many mutations (rows) were there in the dataset?
# This can include some duplicates across studies/chemicals/samples.
# Important to include as a basis for hotspot analysis.
dplyr::tally(snvs_ins_del_clean) # There are 6,465
## # A tibble: 1 × 1
## n
## <int>
## 1 6465
# Missense mutations impairing B-gal
snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::tally() # There are 2,732
## # A tibble: 1 × 1
## n
## <int>
## 1 2732
# Nonsense mutations
snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "nonsense") %>%
  dplyr::tally() # There are 2,206
## # A tibble: 1 × 1
## n
## <int>
## 1 2206
# Silent
snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "silent") %>%
  dplyr::tally() # There are 227
## # A tibble: 1 × 1
## n
## <int>
## 1 227
# SNVs
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  dplyr::tally() # There are 5,147
## # A tibble: 1 × 1
## n
## <int>
## 1 5147
# Missense - this specifically means SNVs
# Put another way:
# How many times did we disrupt the coding sequence of the protein?
# Fig1 holds the number of missense mutations per codon.
Fig1 <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense") %>% # 2,732 rows
  dplyr::count(Codon)
# Supplementary materials - how many unique SNVs?
# One row per distinct (Position, Ref, Alt) with its number of observations;
# the result stays grouped by those three columns.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  dplyr::group_by(Position, Ref, Alt) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "keep")
## # A tibble: 1,399 × 4
## # Groups: Position, Ref, Alt [1,399]
## Position Ref Alt n
## <dbl> <chr> <chr> <int>
## 1 4 A C 1
## 2 6 C A 1
## 3 19 T C 3
## 4 20 C A 2
## 5 20 C G 2
## 6 21 A C 3
## 7 25 G T 6
## 8 26 A C 1
## 9 27 A C 1
## 10 27 A T 1
## # ℹ 1,389 more rows
# There are 1,399
# Of the 1,399, how many mutations does that refer to?
# Sum the per-(Position, Ref, Alt) observation counts back up.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV") %>%
  dplyr::count(Position, Ref, Alt) %>%
  dplyr::pull(n) %>%
  sum()
## [1] 5147
# There are 5,147
# How many unique SNV missense mutations?
# One row per distinct (Position, Ref, Alt) among missense SNVs.
snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV", Consequence == "missense") %>%
  dplyr::group_by(Position, Ref, Alt) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "keep") # There are 895
## # A tibble: 895 × 4
## # Groups: Position, Ref, Alt [895]
## Position Ref Alt n
## <dbl> <chr> <chr> <int>
## 1 4 A C 1
## 2 19 T C 3
## 3 26 A C 1
## 4 27 A C 1
## 5 27 A T 1
## 6 29 T G 1
## 7 30 C G 1
## 8 41 C A 1
## 9 51 A C 1
## 10 52 C A 2
## # ℹ 885 more rows
# How many residues are impacted by a functional change in all the data?
# Number of distinct Codon values among rows with FunctionalChange == 1.
snvs_ins_del_clean %>%
  dplyr::filter(FunctionalChange == 1) %>%
  dplyr::distinct(Codon) %>%
  dplyr::tally() # There are 633
## # A tibble: 1 × 1
## n
## <int>
## 1 633
# How many codons identified by Sanger?
# Missense rows sequenced by Sanger, grouped by codon; the tally prints one
# row per codon.
sanger <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense", Technology == "Sanger") %>%
  dplyr::group_by(Codon)
dplyr::tally(sanger)
## # A tibble: 266 × 2
## Codon n
## <dbl> <int>
## 1 15 1
## 2 19 1
## 3 25 1
## 4 30 1
## 5 47 1
## 6 54 1
## 7 65 1
## 8 67 1
## 9 73 2
## 10 81 1
## # ℹ 256 more rows
# How many codons identified by NGS?
# Missense rows sequenced by NGS, grouped by codon; the tally prints one row
# per codon.
ngs <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense", Technology == "NGS") %>%
  dplyr::group_by(Codon)
dplyr::tally(ngs)
## # A tibble: 384 × 2
## Codon n
## <dbl> <int>
## 1 2 1
## 2 7 3
## 3 8 6
## 4 11 1
## 5 12 2
## 6 16 1
## 7 24 1
## 8 31 2
## 9 32 1
## 10 34 1
## # ℹ 374 more rows
# Number of rows in Fig1 (one per codon value with a missense mutation) ...
nrow(Fig1)
## [1] 492
# ... and that number relative to the length of the first laczref_aa sequence.
nrow(Fig1) / length(laczref_aa[[1]])
## [1] 0.4795322
# Missense mutation count per codon.
ggplot(Fig1, aes(x = Codon, y = n)) +
  geom_col() +
  theme_bw()
# How many codons in spontaneous mutants?
# Missense rows from the "Control" exposure, grouped by codon.
controls <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense", Exposure == "Control") %>%
  dplyr::group_by(Codon)
nrow(controls)
## [1] 859
n_groups(controls) # 204 codons (groups) carry missense mutations in controls
## [1] 204
group_size(controls)
## [1] 1 1 1 1 1 1 1 4 1 1 2 1 1 2 1 1 2 2 1 2 1 9 1 2 1
## [26] 1 3 1 1 1 1 1 1 1 7 26 1 23 1 1 1 1 2 1 1 1 1 1 2 19
## [51] 2 2 2 1 1 1 7 1 2 1 1 54 1 1 1 1 2 1 1 1 4 1 67 3 23
## [76] 1 1 1 2 1 9 1 5 2 2 3 2 1 1 2 9 1 1 1 1 1 1 8 10 1
## [101] 5 1 1 1 5 2 1 1 1 1 1 1 5 3 5 2 7 1 1 1 1 2 1 2 59
## [126] 5 2 1 3 3 6 4 6 1 1 10 4 10 1 1 1 5 1 2 31 3 1 1 1 1
## [151] 1 1 1 1 1 1 2 2 1 1 1 3 54 2 16 1 1 1 1 1 1 1 1 1 28
## [176] 1 1 2 1 25 1 4 1 7 47 1 1 1 5 4 1 5 2 3 1 1 2 1 1 1
## [201] 2 2 1 1
# How many codons in mutagen exposed samples?
# Missense rows from every exposure other than "Control", grouped by codon.
non_controls <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense", Exposure != "Control") %>%
  dplyr::group_by(Codon)
nrow(non_controls)
## [1] 1873
n_groups(non_controls) # 439 codons (groups) carry missense mutations in non-controls
## [1] 439
group_size(non_controls)
## [1] 1 2 5 1 2 1 1 1 1 2 1 1 1 3 1 1 2 2 2 1 1 1 3 1 2
## [26] 2 4 3 1 1 2 1 1 1 1 3 2 2 1 2 1 1 1 1 4 1 2 1 1 1
## [51] 3 4 5 1 1 1 1 2 1 1 2 5 8 1 8 9 1 2 3 1 3 1 3 1 2
## [76] 4 2 2 2 2 1 2 5 2 3 29 11 7 1 6 33 1 13 2 4 1 2 2 1 1
## [101] 1 1 1 1 1 1 1 1 2 1 3 3 1 3 1 1 1 1 2 2 4 2 1 1 1
## [126] 2 1 1 1 19 2 1 13 1 1 1 1 1 1 1 4 1 13 16 2 1 1 2 2 16
## [151] 1 6 9 4 18 1 1 1 1 1 1 1 8 3 2 1 1 1 8 2 8 2 33 20 16
## [176] 1 1 1 1 9 3 6 20 2 8 7 1 15 1 8 3 1 1 4 1 1 2 4 1 3
## [201] 24 1 1 1 5 5 1 1 8 15 17 12 6 8 2 1 1 1 1 4 1 1 5 2 2
## [226] 3 1 1 1 2 2 1 1 18 17 21 5 9 2 1 1 3 1 1 1 1 7 7 2 1
## [251] 30 1 1 62 11 2 4 14 13 4 24 1 2 2 1 1 1 1 1 2 17 20 2 44 3
## [276] 8 3 1 1 4 3 2 1 2 3 6 3 2 28 3 1 1 2 3 1 2 1 1 1 1
## [301] 1 3 2 1 2 1 1 1 2 1 2 1 1 1 2 1 1 1 1 5 2 1 1 1 2
## [326] 2 2 1 1 1 1 1 1 1 2 1 1 1 2 4 88 2 2 1 18 33 2 1 2 4
## [351] 2 1 3 1 1 1 1 2 2 2 3 1 1 1 1 1 13 25 1 6 2 1 1 1 1
## [376] 3 4 3 42 15 1 1 10 2 2 16 44 2 1 1 1 1 2 1 10 13 3 7 11 2
## [401] 16 1 1 20 2 1 2 1 1 7 12 5 2 1 1 1 2 1 1 1 1 1 1 3 1
## [426] 3 7 5 1 2 2 1 1 2 1 1 1 1 1
# Codons common to controls and experimental samples
# There are 151 overlapping
aa_inboth_ctrl_exp <- dplyr::intersect(
  controls %>% dplyr::pull(Codon),
  non_controls %>% dplyr::pull(Codon)
)
# Every distinct codon seen in controls, in order of first appearance.
# This is what the earlier left join on Codon followed by distinct() produced,
# without pairing every control row with every matching non-control row.
controls %>%
  dplyr::pull(Codon) %>%
  unique()
## [1] 7 55 62 77 91 124 146 148 154 194 201 207 272 301 304
## [16] 323 331 353 354 355 357 358 375 379 387 388 389 390 391 393
## [31] 403 406 412 414 418 419 439 440 457 459 460 461 462 463 488
## [46] 500 502 503 505 532 535 537 540 541 542 543 544 545 546 547
## [61] 564 565 568 570 589 602 605 640 652 670 673 691 786 791 792
## [76] 806 832 881 890 897 898 899 901 908 909 914 932 933 935 938
## [91] 942 950 951 993 994 134 135 203 449 651 770 921 324 430 487
## [106] 501 528 611 775 810 952 8 181 313 386 592 763 404 112 166
## [121] 504 561 73 569 485 416 782 210 497 573 863 105 190 360 489
## [136] 169 436 780 934 800 19 145 446 200 206 254 268 295 359 362
## [151] 453 622 872 245 624 773 796 883 941 997 448 149 193 294 296
## [166] 299 352 366 395 405 452 474 492 498 509 515 557 712 733 797
## [181] 398 407 925 946 116 533 904 422 198 356 900 47 54 100 139
## [196] 164 328 986 165 281 852 1022 25 524
dplyr::right_join(
controls %>% dplyr::select(Codon),
non_controls %>% dplyr::select(Codon)
) %>%
distinct() %>%
pull()
## [1] 7 62 91 124 146 148 194 201 207 272 301 304 331 353 354
## [16] 355 357 358 375 387 388 389 390 391 393 403 406 412 414 418
## [31] 419 439 457 459 460 461 462 463 488 500 502 503 505 532 537
## [46] 540 541 542 543 544 545 546 547 564 565 568 570 589 602 605
## [61] 640 652 670 691 786 791 792 806 881 890 897 898 899 901 908
## [76] 909 932 933 935 938 942 951 993 994 134 135 203 449 324 430
## [91] 487 501 528 611 810 952 8 313 386 592 763 404 504 561 73
## [106] 569 485 416 210 497 573 489 436 780 934 145 446 200 206 254
## [121] 295 359 453 622 883 941 997 149 193 296 299 452 498 509 557
## [136] 712 398 533 904 422 198 356 47 54 100 139 328 986 165 852
## [151] 524 11 31 32 35 38 40 44 46 49 50 52 53 65 67
## [166] 70 71 82 84 87 96 97 98 103 119 120 121 137 138 152
## [181] 155 160 162 168 173 184 185 186 199 204 205 208 216 238 251
## [196] 255 262 269 270 298 302 303 310 319 333 336 345 346 372 380
## [211] 381 396 400 402 441 451 464 469 482 490 495 507 512 513 538
## [226] 553 590 599 610 616 623 628 703 705 707 716 722 725 736 750
## [241] 757 777 779 781 785 787 788 805 812 841 879 880 886 889 892
## [256] 902 905 907 912 928 930 936 939 947 949 953 954 961 970 976
## [271] 981 987 995 999 1001 1005 1017 1020 2 12 34 94 104 211 267
## [286] 287 288 361 499 687 688 692 809 815 823 854 966 967 998 16
## [301] 74 88 212 217 244 256 258 259 365 567 603 604 606 778 817
## [316] 840 884 895 991 1000 224 326 749 774 882 102 226 279 363 522
## [331] 608 634 984 45 236 261 466 521 702 740 741 748 311 494 555
## [346] 745 931 937 948 99 101 626 847 43 192 325 420 496 1015 24
## [361] 180 197 465 539 549 594 597 629 916 943 698 1014 370 242 215
## [376] 342 776 228 30 86 118 161 344 423 433 468 534 572 587 708
## [391] 790 826 383 122 399 417 520 552 919 929 958 974 1002 15 81
## [406] 221 364 554 559 766 548 411 859 213 222 315 385 415 425 429
## [421] 456 607 662 709 819 870 129 147 264 384 421 484 676 874 940
## [436] 903 836 280 699
# Codons hit in spontaneous (control) mutants that never appear in
# mutagen-treated samples.
# This calculation only includes missense!
control_aa <- dplyr::pull(controls, Codon)
non_control_aa <- dplyr::pull(non_controls, Codon)
# Keep control codons absent from the treated set, one entry per codon
unique(control_aa[!(control_aa %in% non_control_aa)])
## [1] 55 77 154 323 379 440 535 673 832 914 950 651 770 921 775
## [16] 181 112 166 782 863 105 190 360 169 800 19 268 362 872 245
## [31] 624 773 796 448 294 352 366 395 405 474 492 515 733 797 407
## [46] 925 946 116 900 164 281 1022 25
controls %>% filter(!Codon %in% (non_controls %>% pull(Codon))) # Another way to calculate it
## # A tibble: 63 × 41
## # Groups: Codon [53]
## Exposure Tissue Dose Position Ref Alt `Tech Rep1` `Tech Rep2`
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 Control Bone Marrow 0 182 A C 0.0327 0.0239
## 2 Control Bone Marrow 0 247 G T 0.0376 0.036
## 3 Control Bone Marrow 0 478 T C 0.0106 0.011
## 4 Control Bone Marrow 0 478 T C 0.0396 0.0453
## 5 Control Bone Marrow 0 986 T A 0.136 0.128
## 6 Control Bone Marrow 0 1154 T G 0.0043 0.0042
## 7 Control Bone Marrow 0 1336 G T 0.0249 0.0253
## 8 Control Bone Marrow 0 1622 T G 0.0085 0.0089
## 9 Control Bone Marrow 0 1622 T G 0.0337 0.0319
## 10 Control Bone Marrow 0 2035 G A 0.0503 0.0681
## # ℹ 53 more rows
## # ℹ 33 more variables: `Tech Difference` <dbl>, Background <dbl>,
## # `Avg Freq` <dbl>, Count <dbl>, `A:T to G:C` <dbl>, `G:C to A:T` <dbl>,
## # `G:C to T:A` <dbl>, `G:C to C:G` <dbl>, `A:T to T:A` <dbl>,
## # `A:T to C:G` <dbl>, Insertion <dbl>, Deletion <dbl>, Codon <dbl>,
## # Consequence <chr>, `Ref Codon` <chr>, `Alt Codon` <chr>, `Ref A.A.` <chr>,
## # `Alt A.A.` <chr>, Type <chr>, Study <chr>, Technology <chr>, …
# There are 63 of these:
controls_only_missense <- controls %>% filter(!Codon %in% (non_controls %>% pull(Codon)))
# Hotspot analysis
# The missense-only restriction is intentionally left out: nonsense mutations,
# etc., stay in, because all consequences matter when counting where
# hotspots are.
snvs_only <- dplyr::filter(snvs_ins_del_clean, Type == "SNV")
# Optional restriction: dplyr::filter(snvs_only, Consequence == "missense")
count(snvs_only) # Same as above, 5,147
## # A tibble: 1 × 1
## n
## <int>
## 1 5147
counts_per_nucleotide <- snvs_only %>%
dplyr::group_by(Position) %>%
dplyr::tally() %>%
pull(n)
mean(counts_per_nucleotide)
## [1] 4.892586
sd(counts_per_nucleotide)
## [1] 8.932928
mean(counts_per_nucleotide) + sd(counts_per_nucleotide)
## [1] 13.82551
# Hotspot threshold: mean mutations per position plus one standard deviation
hotspot_cutoff_value <- sd(counts_per_nucleotide) + mean(counts_per_nucleotide)
# Tag every SNV with the number of SNVs at its position (column `n`) and keep
# the ones at or above the threshold (cutoff changed from 10 to 14)
hotspots <- snvs_only %>%
  dplyr::add_count(Position) %>%
  dplyr::filter(n >= hotspot_cutoff_value)
# Number of mutations located at hotspot positions.
# With all consequences included this is 2,069; restricted to missense SNVs
# it was 1,256.
count(hotspots)
## # A tibble: 1 × 1
## n
## <int>
## 1 2069
# There are 2,069 when including all consequences
hotspots %>%
dplyr::group_by(Position) %>%
tally() %>%
arrange(-n)
## # A tibble: 74 × 2
## Position n
## <dbl> <int>
## 1 1187 112
## 2 2374 106
## 3 1627 91
## 4 1072 90
## 5 1090 81
## 6 2713 56
## 7 1831 48
## 8 2743 46
## 9 2744 45
## 10 928 44
## # ℹ 64 more rows
# There are 74 hotspots when including all consequences
# There are 39 missense hotspot nucleotides with a count >13 independent mutations
# 136 if using a cutoff of 10 as done originally without Sanger data
ranked_hotspots_all <- hotspots %>%
dplyr::group_by(Position) %>%
tally() %>%
arrange(-n)
# ranked_hotspots_ngs$Position <- factor(ranked_hotspots_ngs$Position,
# levels = ranked_hotspots_ngs$Position)
#
# ggplot(ranked_hotspots_ngs %>% head(n = 20), aes(x = Position, y = n, group = Position)) +
# geom_bar(stat = "identity") +
# theme(axis.text.x = element_text(angle = 90))
ranked_hotspots_all$Position <- factor(ranked_hotspots_all$Position,
levels = ranked_hotspots_all$Position
)
ggplot(ranked_hotspots_all %>% head(n = 20), aes(x = Position, y = n, group = Position)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90))
# Unique nucleotide positions classified as hotspots
hs <- unique(hotspots$Position)
# One single-base range per hotspot position on the "lacZ" sequence.
# IRanges(start = hs, end = hs) already yields exactly these ranges, so the
# earlier start(ranges(...)) / end(ranges(...)) round-trip is not needed.
hotspot_ranges <- GRanges(
  seqnames = "lacZ",
  ranges = IRanges(start = hs, end = hs)
)
# Hotspot ranges that overlap an entry of CpG_sites_ranges
CpGs_in_hotspots <- plyranges::find_overlaps(hotspot_ranges, CpG_sites_ranges)
CpGs_in_hotspots <- as.data.frame(CpGs_in_hotspots) # %>% dplyr::filter(Type == "SNV")
CpGs_in_hotspots
## seqnames start end width strand
## 1 lacZ 461 461 1 *
## 2 lacZ 637 637 1 *
## 3 lacZ 928 928 1 *
## 4 lacZ 1016 1016 1 *
## 5 lacZ 1090 1090 1 *
## 6 lacZ 1187 1187 1 *
## 7 lacZ 1196 1196 1 *
## 8 lacZ 1334 1334 1 *
## 9 lacZ 1520 1520 1 *
## 10 lacZ 1627 1627 1 *
## 11 lacZ 1638 1638 1 *
## 12 lacZ 1831 1831 1 *
## 13 lacZ 2374 2374 1 *
## 14 lacZ 2375 2375 1 *
## 15 lacZ 2392 2392 1 *
## 16 lacZ 2659 2659 1 *
## 17 lacZ 2713 2713 1 *
## 18 lacZ 2740 2740 1 *
## 19 lacZ 2743 2743 1 *
## 20 lacZ 2744 2744 1 *
## 21 lacZ 2813 2813 1 *
## 22 lacZ 2817 2817 1 *
## 23 lacZ 2835 2835 1 *
## 24 lacZ 2840 2840 1 *
## 25 lacZ 136 136 1 *
## 26 lacZ 187 187 1 *
## 27 lacZ 303 303 1 *
## 28 lacZ 436 436 1 *
## 29 lacZ 501 501 1 *
## 30 lacZ 759 759 1 *
## 31 lacZ 1018 1018 1 *
## 32 lacZ 1224 1224 1 *
## 33 lacZ 1233 1233 1 *
## 34 lacZ 1342 1342 1 *
## 35 lacZ 1388 1388 1 *
## 36 lacZ 1527 1527 1 *
## 37 lacZ 1626 1626 1 *
## 38 lacZ 1739 1739 1 *
## 39 lacZ 1775 1775 1 *
## 40 lacZ 1782 1782 1 *
## 41 lacZ 2266 2266 1 *
## 42 lacZ 2473 2473 1 *
## 43 lacZ 2805 2805 1 *
## 44 lacZ 3029 3029 1 *
## 45 lacZ 1072 1072 1 *
hotspot_cpg_muts <- snvs_ins_del_clean %>% dplyr::filter(Position %in% CpGs_in_hotspots$start)
hotspot_cpg_muts
## # A tibble: 1,566 × 41
## Exposure Tissue Dose Position Ref Alt `Tech Rep1` `Tech Rep2`
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 BaP Bone Marrow 100 461 C G 0.0043 0.0047
## 2 BaP Bone Marrow 100 461 C G 0.007 0.0084
## 3 BaP Bone Marrow 100 461 C G 0.0029 0.0023
## 4 BaP Bone Marrow 100 637 G T 0.0021 0.0021
## 5 BaP Bone Marrow 100 637 G T 0.0029 0.0026
## 6 BaP Bone Marrow 100 637 G T 0.0075 0.0083
## 7 BaP Bone Marrow 100 637 G C 0.0026 0.0022
## 8 BaP Bone Marrow 100 637 G C 0.0029 0.0025
## 9 BaP Bone Marrow 100 637 G C 0.0026 0.0021
## 10 BaP Bone Marrow 100 637 G C 0.0023 0.0031
## # ℹ 1,556 more rows
## # ℹ 33 more variables: `Tech Difference` <dbl>, Background <dbl>,
## # `Avg Freq` <dbl>, Count <dbl>, `A:T to G:C` <dbl>, `G:C to A:T` <dbl>,
## # `G:C to T:A` <dbl>, `G:C to C:G` <dbl>, `A:T to T:A` <dbl>,
## # `A:T to C:G` <dbl>, Insertion <dbl>, Deletion <dbl>, Codon <dbl>,
## # Consequence <chr>, `Ref Codon` <chr>, `Alt Codon` <chr>, `Ref A.A.` <chr>,
## # `Alt A.A.` <chr>, Type <chr>, Study <chr>, Technology <chr>, …
# There are 45 hotspots that overlap with CpG sites
nrow(CpGs_in_hotspots) / length(hs)
## [1] 0.6081081
# Hotspots in NGS data
snvs_only_ngs <- snvs_ins_del_clean %>%
  dplyr::filter(Type == "SNV", Technology == "NGS")
# NOTE(review): `hotspot_cutoff_value` was computed from all SNVs (every
# technology), not from the NGS subset — confirm that reusing it here is
# intended.
hotspots_ngs <- snvs_only_ngs %>%
  dplyr::group_by(Position) %>%
  dplyr::add_count() %>%
  dplyr::ungroup() %>% # drop the grouping, as is done for `hotspots`
  dplyr::filter(n >= hotspot_cutoff_value)
# Mutations per NGS hotspot position
hotspots_ngs %>%
  dplyr::group_by(Position) %>%
  tally()
## # A tibble: 52 × 2
## Position n
## <dbl> <int>
## 1 303 20
## 2 421 17
## 3 461 26
## 4 501 39
## 5 610 14
## 6 637 18
## 7 638 18
## 8 928 29
## 9 1016 15
## 10 1018 22
## # ℹ 42 more rows
hotspots_ngs %>%
group_by(Position, `Ref Codon`) %>%
tally() %>%
group_by(`Ref Codon`) %>%
tally()
## # A tibble: 18 × 2
## `Ref Codon` n
## <chr> <int>
## 1 CAC 2
## 2 CAG 1
## 3 CCG 2
## 4 CGA 2
## 5 CGC 4
## 6 CGG 1
## 7 CGT 2
## 8 GAA 5
## 9 GAC 2
## 10 GAG 4
## 11 GGA 2
## 12 GGC 6
## 13 GGG 1
## 14 GGT 1
## 15 TAC 8
## 16 TCG 5
## 17 TGC 2
## 18 TGG 2
ranked_hotspots_ngs <- hotspots_ngs %>%
dplyr::group_by(Position) %>%
tally() %>%
arrange(-n)
counts_per_nucleotide <- snvs_only_ngs %>%
dplyr::group_by(Position) %>%
dplyr::tally() %>%
pull(n)
mean(counts_per_nucleotide)
## [1] 4.235028
sd(counts_per_nucleotide)
## [1] 5.856226
mean(counts_per_nucleotide) + sd(counts_per_nucleotide)
## [1] 10.09125
# 692 codons mutated in total
snvs_ins_del_clean %>%
dplyr::group_by(Codon) %>%
tally()
## # A tibble: 692 × 2
## Codon n
## <dbl> <int>
## 1 2 2
## 2 7 10
## 3 8 16
## 4 11 5
## 5 12 9
## 6 15 1
## 7 16 19
## 8 17 2
## 9 19 3
## 10 23 10
## # ℹ 682 more rows
# 492 codons with missense mutations
snvs_ins_del_clean %>%
dplyr::group_by(Codon) %>%
dplyr::filter(Consequence == "missense") %>%
tally()
## # A tibble: 492 × 2
## Codon n
## <dbl> <int>
## 1 2 1
## 2 7 3
## 3 8 6
## 4 11 1
## 5 12 2
## 6 15 1
## 7 16 1
## 8 19 1
## 9 24 1
## 10 25 1
## # ℹ 482 more rows
# 605 codons mutated in NGS data
snvs_only_ngs %>%
dplyr::group_by(Codon) %>%
tally()
## # A tibble: 605 × 2
## Codon n
## <dbl> <int>
## 1 2 2
## 2 7 8
## 3 8 14
## 4 11 3
## 5 12 6
## 6 16 13
## 7 17 2
## 8 19 2
## 9 23 5
## 10 24 1
## # ℹ 595 more rows
# 384 codons with missense mutations in NGS data
snvs_only_ngs %>%
dplyr::group_by(Codon) %>%
dplyr::filter(Consequence == "missense") %>%
tally()
## # A tibble: 384 × 2
## Codon n
## <dbl> <int>
## 1 2 1
## 2 7 3
## 3 8 6
## 4 11 1
## 5 12 2
## 6 16 1
## 7 24 1
## 8 31 2
## 9 32 1
## 10 34 1
## # ℹ 374 more rows
# 266 codons identified by Sanger
sanger %>%
dplyr::group_by(Codon) %>%
dplyr::filter(Consequence == "missense") %>%
tally()
## # A tibble: 266 × 2
## Codon n
## <dbl> <int>
## 1 15 1
## 2 19 1
## 3 25 1
## 4 30 1
## 5 47 1
## 6 54 1
## 7 65 1
## 8 67 1
## 9 73 2
## 10 81 1
## # ℹ 256 more rows
# How many codons found with both technologies?
dplyr::inner_join(
sanger %>% dplyr::select(Codon),
ngs %>% dplyr::select(Codon)
) %>%
distinct() # 158
## # A tibble: 158 × 1
## # Groups: Codon [158]
## Codon
## <dbl>
## 1 148
## 2 391
## 3 540
## 4 880
## 5 881
## 6 941
## 7 994
## 8 355
## 9 786
## 10 908
## # ℹ 148 more rows
# Calculate mutations per codon
counts_per_codon <- snvs_ins_del_clean %>%
dplyr::group_by(Codon) %>%
dplyr::filter(Consequence == "missense") %>%
dplyr::tally() %>%
pull(n)
mean(counts_per_codon)
## [1] 5.552846
sd(counts_per_codon)
## [1] 13.07067
mean(counts_per_codon) + sd(counts_per_codon)
## [1] 18.62352
# Codon-level hotspot threshold: mean missense count per codon plus one SD
hotspot_cutoff_value_codon <- sd(counts_per_codon) + mean(counts_per_codon)
# Codon hotspots
# NGS only - not used in study
hotspots_ngs_codons <- snvs_only_ngs %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::add_count(Codon) %>%
  dplyr::filter(n >= hotspot_cutoff_value_codon)
# Missense mutations per NGS hotspot codon
hotspots_ngs_codons %>%
  dplyr::group_by(Codon) %>%
  tally()
## # A tibble: 24 × 2
## Codon n
## <dbl> <int>
## 1 201 28
## 2 207 36
## 3 301 19
## 4 304 19
## 5 353 20
## 6 358 21
## 7 390 31
## 8 393 19
## 9 406 26
## 10 439 22
## # ℹ 14 more rows
ranked_hotspots_codons <- hotspots_ngs_codons %>%
dplyr::group_by(Codon) %>%
tally() %>%
arrange(-n)
ranked_hotspots_codons$Codon <- factor(ranked_hotspots_codons$Codon,
levels = ranked_hotspots_codons$Codon
)
codons_of_interest <- ranked_hotspots_codons %>%
head(n = 37) %>%
pull(Codon)
ggplot(ranked_hotspots_codons %>% head(n = 37), aes(x = Codon, y = n, group = Codon)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90))
# Codon hotspots - all
# Missense mutations at codons whose missense count reaches the codon cutoff
hotspots_codons <- snvs_ins_del_clean %>%
  # dplyr::filter(Type=="SNV") %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::group_by(Codon) %>%
  dplyr::add_count() %>%
  dplyr::ungroup() %>%
  dplyr::filter(n >= hotspot_cutoff_value_codon)
# Missense mutations (FunctionalChange == 1) at codons that carry at least
# two such mutations
hotspots_codons_2_independent <- snvs_ins_del_clean %>%
  # dplyr::filter(Type=="SNV") %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::filter(FunctionalChange == 1) %>%
  dplyr::group_by(Codon) %>%
  dplyr::add_count() %>%
  dplyr::ungroup() %>% # drop the grouping, as is done for `hotspots_codons`
  dplyr::filter(n >= 2)
# In all the data, how many codons are affected by at least two missense
# mutations, and how many mutations does each of them carry?
hotspots_codons_2_independent %>%
  group_by(Codon) %>%
  tally()
## # A tibble: 264 × 2
## Codon n
## <dbl> <int>
## 1 7 3
## 2 8 6
## 3 12 2
## 4 31 2
## 5 38 3
## 6 44 2
## 7 45 2
## 8 46 2
## 9 47 2
## 10 52 3
## # ℹ 254 more rows
codons_to_visualize <- hotspots_codons_2_independent %>%
pull(Codon) %>%
unique()
codons_to_visualize
## [1] 7 8 31 38 44 46 52 54 62 65 67 87 119 120 121
## [16] 139 145 148 149 155 160 162 168 184 185 186 193 194 199 200
## [31] 201 203 204 206 207 210 216 251 254 255 269 270 272 295 301
## [46] 302 304 328 331 333 336 345 346 353 355 356 357 358 375 380
## [61] 381 386 387 388 389 390 391 393 402 403 404 406 412 418 419
## [76] 436 439 446 451 452 459 461 463 464 482 489 490 498 501 502
## [91] 504 505 507 509 513 528 532 537 540 541 543 544 545 547 564
## [106] 565 568 569 570 573 589 590 599 605 610 611 622 640 670 705
## [121] 716 722 777 780 781 785 786 787 788 791 792 805 812 841 852
## [136] 880 881 899 901 905 907 908 909 912 930 932 933 934 935 936
## [151] 938 941 942 947 952 953 954 987 994 995 999 12 91 94 146
## [166] 211 324 449 546 691 809 810 883 890 897 967 997 998 124 154
## [181] 323 354 414 457 460 462 488 500 503 535 542 602 652 806 898
## [196] 950 951 993 88 100 198 212 256 259 359 416 487 533 567 603
## [211] 604 606 817 840 884 895 904 47 326 422 73 102 165 279 299
## [226] 398 561 763 45 741 748 134 135 770 712 745 937 430 524 847
## [241] 192 497 313 592 180 197 549 597 557 698 485 215 112 166 782
## [256] 453 296 986 86 572 552 1002 411 429
codons_to_visualize %>% length()
## [1] 264
# There are 263... or 264 if you look at missense not limited to SNVs
paste(codons_to_visualize, collapse = "+") # For PyMol
## [1] "7+8+31+38+44+46+52+54+62+65+67+87+119+120+121+139+145+148+149+155+160+162+168+184+185+186+193+194+199+200+201+203+204+206+207+210+216+251+254+255+269+270+272+295+301+302+304+328+331+333+336+345+346+353+355+356+357+358+375+380+381+386+387+388+389+390+391+393+402+403+404+406+412+418+419+436+439+446+451+452+459+461+463+464+482+489+490+498+501+502+504+505+507+509+513+528+532+537+540+541+543+544+545+547+564+565+568+569+570+573+589+590+599+605+610+611+622+640+670+705+716+722+777+780+781+785+786+787+788+791+792+805+812+841+852+880+881+899+901+905+907+908+909+912+930+932+933+934+935+936+938+941+942+947+952+953+954+987+994+995+999+12+91+94+146+211+324+449+546+691+809+810+883+890+897+967+997+998+124+154+323+354+414+457+460+462+488+500+503+535+542+602+652+806+898+950+951+993+88+100+198+212+256+259+359+416+487+533+567+603+604+606+817+840+884+895+904+47+326+422+73+102+165+279+299+398+561+763+45+741+748+134+135+770+712+745+937+430+524+847+192+497+313+592+180+197+549+597+557+698+485+215+112+166+782+453+296+986+86+572+552+1002+411+429"
hotspots_codons %>%
dplyr::group_by(Codon) %>%
tally()
## # A tibble: 33 × 2
## Codon n
## <dbl> <int>
## 1 201 36
## 2 203 37
## 3 207 56
## 4 301 21
## 5 304 32
## 6 353 23
## 7 358 72
## 8 390 100
## 9 391 23
## 10 393 39
## # ℹ 23 more rows
ranked_hotspots_codons_all <- hotspots_codons %>%
dplyr::group_by(Codon) %>%
tally() %>%
arrange(-n)
ranked_hotspots_codons_all$Codon <- factor(ranked_hotspots_codons_all$Codon,
levels = ranked_hotspots_codons_all$Codon
)
# 33 codons of interest, top ranked hot spots
codons_of_interest <- ranked_hotspots_codons_all %>%
head(n = 33) %>%
pull(Codon)
codons_of_interest[codons_of_interest %in% existing$aa_pos]
## [1] 537 540 568 201 503 391
## 33 Levels: 786 390 909 537 358 540 899 605 207 568 881 792 393 203 201 ... 901
existing[existing$aa_pos %in% codons_of_interest, ]
## nuc_start nuc_end wt_aa aa_pos ref_nuc_range ref_nuc_start ref_nuc_end
## 2 619 621 Asp 201 601 - 603 601 603
## 4 1189 1191 His 391 1171 - 1173 1171 1173
## 9 1525 1527 Tyr 503 1507 - 1509 1507 1509
## 11 1627 1629 Glu 537 1609 - 1611 1609 1611
## 12 1636 1638 His 540 1618 - 1620 1618 1620
## 14 1720 1722 Trp 568 1702 - 1704 1702 1704
paste(codons_of_interest, collapse = "+") # For PyMol
## [1] "786+390+909+537+358+540+899+605+207+568+881+792+393+203+201+439+304+547+406+564+503+459+565+353+391+501+908+941+301+502+791+545+901"
ggplot(ranked_hotspots_codons_all %>% head(n = 33), aes(x = Codon, y = n, group = Codon)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90))
hotspots_codons_summary <- hotspots_codons %>%
dplyr::group_by(residue_name, Domain) %>%
tally() %>%
arrange(Domain, -n)
hotspots_codons_summary
## # A tibble: 34 × 3
## # Groups: residue_name [34]
## residue_name Domain n
## <chr> <chr> <int>
## 1 Gly207 Sugar Binding (PF02837) 56
## 2 Trp203 Sugar Binding (PF02837) 37
## 3 Asp201 Sugar Binding (PF02837) 36
## 4 Ser390 TIM Barrel (PF02836) 100
## 5 Glu537 TIM Barrel (PF02836) 89
## 6 Glu358 TIM Barrel (PF02836) 72
## 7 His540 TIM Barrel (PF02836) 67
## 8 Gly605 TIM Barrel (PF02836) 59
## 9 Trp568 TIM Barrel (PF02836) 54
## 10 Pro393 TIM Barrel (PF02836) 39
## # ℹ 24 more rows
# Write the hotspot codon summary as a tab-separated file
write.table(hotspots_codons_summary,
  file = "data/processed/table2.txt",
  quote = FALSE, sep = "\t", row.names = FALSE
)
# Missense mutation counts per residue and domain, without any hotspot cutoff
table_S13 <- snvs_ins_del_clean %>%
  # dplyr::filter(Type=="SNV") %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::group_by(Codon) %>%
  dplyr::add_count() %>%
  dplyr::ungroup() %>%
  dplyr::group_by(residue_name, Domain) %>%
  tally() %>%
  arrange(Domain, -n)
write.table(table_S13,
  file = "data/processed/table_s13.txt",
  quote = FALSE, sep = "\t", row.names = FALSE
)
# Glutamate 537 (Glu537) example
snvs_ins_del_clean %>%
filter(Codon == 537) %>%
filter(Consequence == "missense")
## # A tibble: 89 × 41
## Exposure Tissue Dose Position Ref Alt `Tech Rep1` `Tech Rep2`
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 BaP Bone Marrow 100 1627 G A 0.0101 0.01
## 2 BaP Bone Marrow 100 1627 G C 0.0026 0.0025
## 3 BaP Bone Marrow 100 1627 G C 0.0029 0.0026
## 4 BaP Bone Marrow 100 1627 G C 0.0031 0.0034
## 5 BaP Bone Marrow 100 1627 G A 0.0047 0.004
## 6 Control Bone Marrow 0 1627 G A 0.746 0.746
## 7 Control Bone Marrow 0 1627 G A 0.272 0.276
## 8 Control Bone Marrow 0 1627 G A 0.0298 0.0305
## 9 Control Bone Marrow 0 1627 G A 0.171 0.178
## 10 Control Bone Marrow 0 1627 G A 0.210 0.201
## # ℹ 79 more rows
## # ℹ 33 more variables: `Tech Difference` <dbl>, Background <dbl>,
## # `Avg Freq` <dbl>, Count <dbl>, `A:T to G:C` <dbl>, `G:C to A:T` <dbl>,
## # `G:C to T:A` <dbl>, `G:C to C:G` <dbl>, `A:T to T:A` <dbl>,
## # `A:T to C:G` <dbl>, Insertion <dbl>, Deletion <dbl>, Codon <dbl>,
## # Consequence <chr>, `Ref Codon` <chr>, `Alt Codon` <chr>, `Ref A.A.` <chr>,
## # `Alt A.A.` <chr>, Type <chr>, Study <chr>, Technology <chr>, …
snvs_ins_del_clean %>%
filter(Codon == 537) %>%
filter(Consequence == "missense") %>%
filter(Study == "This Study")
## # A tibble: 37 × 41
## Exposure Tissue Dose Position Ref Alt `Tech Rep1` `Tech Rep2`
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 BaP Bone Marrow 100 1627 G A 0.0101 0.01
## 2 BaP Bone Marrow 100 1627 G C 0.0026 0.0025
## 3 BaP Bone Marrow 100 1627 G C 0.0029 0.0026
## 4 BaP Bone Marrow 100 1627 G C 0.0031 0.0034
## 5 BaP Bone Marrow 100 1627 G A 0.0047 0.004
## 6 Control Bone Marrow 0 1627 G A 0.746 0.746
## 7 Control Bone Marrow 0 1627 G A 0.272 0.276
## 8 Control Bone Marrow 0 1627 G A 0.0298 0.0305
## 9 Control Bone Marrow 0 1627 G A 0.171 0.178
## 10 Control Bone Marrow 0 1627 G A 0.210 0.201
## # ℹ 27 more rows
## # ℹ 33 more variables: `Tech Difference` <dbl>, Background <dbl>,
## # `Avg Freq` <dbl>, Count <dbl>, `A:T to G:C` <dbl>, `G:C to A:T` <dbl>,
## # `G:C to T:A` <dbl>, `G:C to C:G` <dbl>, `A:T to T:A` <dbl>,
## # `A:T to C:G` <dbl>, Insertion <dbl>, Deletion <dbl>, Codon <dbl>,
## # Consequence <chr>, `Ref Codon` <chr>, `Alt Codon` <chr>, `Ref A.A.` <chr>,
## # `Alt A.A.` <chr>, Type <chr>, Study <chr>, Technology <chr>, …
# Arg786
snvs_ins_del_clean %>%
filter(Codon == 786) %>%
filter(Consequence == "missense") %>%
filter(Technology == "NGS")
## # A tibble: 96 × 41
## Exposure Tissue Dose Position Ref Alt `Tech Rep1` `Tech Rep2`
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 BaP Bone Marrow 100 2374 C A 0.0309 0.0316
## 2 BaP Bone Marrow 100 2374 C G 0.0144 0.0139
## 3 BaP Bone Marrow 100 2374 C A 0.0497 0.0517
## 4 BaP Bone Marrow 100 2374 C G 0.0029 0.0028
## 5 BaP Bone Marrow 100 2374 C A 0.0093 0.0098
## 6 BaP Bone Marrow 100 2374 C T 0.0076 0.0082
## 7 BaP Bone Marrow 100 2374 C A 0.0344 0.0309
## 8 BaP Bone Marrow 100 2374 C T 0.0035 0.004
## 9 BaP Bone Marrow 100 2374 C G 0.0027 0.0023
## 10 BaP Bone Marrow 100 2374 C G 0.0087 0.0072
## # ℹ 86 more rows
## # ℹ 33 more variables: `Tech Difference` <dbl>, Background <dbl>,
## # `Avg Freq` <dbl>, Count <dbl>, `A:T to G:C` <dbl>, `G:C to A:T` <dbl>,
## # `G:C to T:A` <dbl>, `G:C to C:G` <dbl>, `A:T to T:A` <dbl>,
## # `A:T to C:G` <dbl>, Insertion <dbl>, Deletion <dbl>, Codon <dbl>,
## # Consequence <chr>, `Ref Codon` <chr>, `Alt Codon` <chr>, `Ref A.A.` <chr>,
## # `Alt A.A.` <chr>, Type <chr>, Study <chr>, Technology <chr>, …
# There are 143 mutations at Arg786
# 142 missense, 1 indel
snvs_ins_del_clean %>%
dplyr::filter(Codon == 786) %>%
group_by(Ref, Alt)
## # A tibble: 143 × 41
## # Groups: Ref, Alt [7]
## Exposure Tissue Dose Position Ref Alt `Tech Rep1` `Tech Rep2`
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 BaP Bone Marrow 100 2374 C A 0.0309 0.0316
## 2 BaP Bone Marrow 100 2374 C G 0.0144 0.0139
## 3 BaP Bone Marrow 100 2374 C A 0.0497 0.0517
## 4 BaP Bone Marrow 100 2374 C G 0.0029 0.0028
## 5 BaP Bone Marrow 100 2374 C A 0.0093 0.0098
## 6 BaP Bone Marrow 100 2374 C T 0.0076 0.0082
## 7 BaP Bone Marrow 100 2374 C A 0.0344 0.0309
## 8 BaP Bone Marrow 100 2374 C T 0.0035 0.004
## 9 BaP Bone Marrow 100 2374 C G 0.0027 0.0023
## 10 BaP Bone Marrow 100 2374 C G 0.0087 0.0072
## # ℹ 133 more rows
## # ℹ 33 more variables: `Tech Difference` <dbl>, Background <dbl>,
## # `Avg Freq` <dbl>, Count <dbl>, `A:T to G:C` <dbl>, `G:C to A:T` <dbl>,
## # `G:C to T:A` <dbl>, `G:C to C:G` <dbl>, `A:T to T:A` <dbl>,
## # `A:T to C:G` <dbl>, Insertion <dbl>, Deletion <dbl>, Codon <dbl>,
## # Consequence <chr>, `Ref Codon` <chr>, `Alt Codon` <chr>, `Ref A.A.` <chr>,
## # `Alt A.A.` <chr>, Type <chr>, Study <chr>, Technology <chr>, …
snvs_ins_del_clean %>%
dplyr::filter(Codon == 786) %>%
group_by(Ref, Alt) %>%
tally() %>%
pull(n) %>%
sum()
## [1] 143
# There are 113 mutations at Ser390
snvs_ins_del_clean %>%
dplyr::filter(Codon == 390) %>%
group_by(Ref, Alt)
## # A tibble: 113 × 41
## # Groups: Ref, Alt [4]
## Exposure Tissue Dose Position Ref Alt `Tech Rep1` `Tech Rep2`
## <chr> <chr> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 BaP Bone Marrow 100 1187 C T 0.0038 0.0043
## 2 BaP Bone Marrow 100 1187 C G 0.0044 0.0037
## 3 BaP Bone Marrow 100 1187 C A 0.0028 0.0028
## 4 BaP Bone Marrow 100 1187 C A 0.0044 0.0043
## 5 BaP Bone Marrow 100 1187 C A 0.009 0.0087
## 6 BaP Bone Marrow 100 1187 C A 0.0117 0.0122
## 7 BaP Bone Marrow 100 1187 C A 0.0053 0.005
## 8 BaP Bone Marrow 100 1188 G C 0.0025 0.0026
## 9 BaP-IU Bone Marrow 20 1187 C T 0.230 0.175
## 10 BaP-IU Bone Marrow 40 1187 C T 0.0183 0.0154
## # ℹ 103 more rows
## # ℹ 33 more variables: `Tech Difference` <dbl>, Background <dbl>,
## # `Avg Freq` <dbl>, Count <dbl>, `A:T to G:C` <dbl>, `G:C to A:T` <dbl>,
## # `G:C to T:A` <dbl>, `G:C to C:G` <dbl>, `A:T to T:A` <dbl>,
## # `A:T to C:G` <dbl>, Insertion <dbl>, Deletion <dbl>, Codon <dbl>,
## # Consequence <chr>, `Ref Codon` <chr>, `Alt Codon` <chr>, `Ref A.A.` <chr>,
## # `Alt A.A.` <chr>, Type <chr>, Study <chr>, Technology <chr>, …
snvs_ins_del_clean %>%
dplyr::filter(Codon == 390) %>%
group_by(Ref, Alt) %>%
tally() %>%
pull(n) %>%
sum()
## [1] 113
snvs_ins_del_clean %>%
dplyr::filter(PositionRef == 1169) %>%
group_by(Ref, Alt) %>%
dplyr::filter(Type == "SNV") %>%
count() # 112 SNVs at position 1,169
## # A tibble: 3 × 3
## # Groups: Ref, Alt [3]
## Ref Alt n
## <chr> <chr> <int>
## 1 C A 12
## 2 C G 2
## 3 C T 98
## Summary of data
vtree(snvs_ins_del_clean,
title = "Total mutations",
# imageFileOnly=TRUE, # to output a PNG
"Technology Type Consequence",
prunebelow = list(Type = c("Insertion", "Deletion"))
)
vtree(snvs_ins_del_clean,
vars = " Exposure codon_position "
)
vtree(snvs_ins_del_clean,
title = "Total mutations",
# imageFileOnly=TRUE, # to output a PNG
"Exposure Study"
)
vtree(snvs_ins_del_clean,
title = "Total mutations",
# imageFileOnly=TRUE, # to output a PNG
"Exposure"
)
vtree(snvs_ins_del_clean,
title = "Total mutations",
# imageFileOnly=TRUE, # to output a PNG
"Exposure Technology"
)
vtree(snvs_ins_del_clean,
title = "Total mutations",
# imageFileOnly=TRUE, # to output a PNG
"Exposure Tissue"
)
vtree(snvs_ins_del_clean %>% dplyr::filter(!is.na(Consequence)),
title = "Total mutations",
# imageFileOnly=TRUE, # to output a PNG
"Consequence"
)
vtree(snvs_ins_del_clean,
title = "Total mutations",
# imageFileOnly=TRUE, # to output a PNG
"Technology Consequence"
)
vtree(snvs_ins_del_clean,
title = "Total mutations",
# imageFileOnly=TRUE, # to output a PNG
"Consequence Technology"
)
# One row per codon with a missense mutation; Technology holds the distinct
# technologies seen for that codon, joined with ", "
codon_composition_tech <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::select(Codon, Technology) %>%
  dplyr::group_by(Codon) %>%
  dplyr::distinct() %>%
  mutate(Technology = paste0(Technology, collapse = ", ")) %>%
  dplyr::distinct()
# Distinct Exposure / Codon / Technology combinations among missense mutations
codon_composition_exp <- snvs_ins_del_clean %>%
  dplyr::filter(Consequence == "missense") %>%
  dplyr::select(Exposure, Codon, Technology) %>%
  dplyr::group_by(Codon) %>%
  dplyr::distinct()
vtree(codon_composition_tech,
  title = "Codons",
  # imageFileOnly = TRUE, # to output a PNG
  "Technology"
)
vtree(codon_composition_exp,
  title = "Codons",
  # imageFileOnly = TRUE, # to output a PNG
  "Exposure Technology",
  pattern = TRUE
)
vtree(snvs_ins_del_clean,
title = "Total mutations",
# imageFileOnly=TRUE, # to output a PNG
"Type Technology"
)
# Codon of every mutation whose Exposure is not "Control"
exp_codons <- snvs_ins_del_clean %>%
  dplyr::filter(Exposure != "Control") %>%
  pull(Codon)
# Grouping by Codon does not alter the pulled vector, so both names hold the
# same values
exp_codons_ungrouped <- exp_codons
# Mutations at codons that are never hit in a non-control sample
controls_only <- dplyr::filter(snvs_ins_del_clean, !(Codon %in% exp_codons))
# This calculation ONLY accounts for unique codons
controls_only %>%
  dplyr::distinct(Codon) %>%
  dplyr::arrange(Codon) %>%
  pull(Codon)
## [1] 25 55 77 112 164 166 169 176 183 187 190 294 323 360 362
## [16] 366 379 395 407 440 448 492 515 535 625 648 651 653 673 684
## [31] 693 733 770 782 796 800 811 832 843 846 872 921 925 1008 1022
# This calculation is for everything, like above
control_only_aa_all <- snvs_ins_del_clean %>%
dplyr::filter(Exposure == "Control") %>%
dplyr::select(Codon) %>%
pull()
non_control_aa_all <- snvs_ins_del_clean %>%
dplyr::filter(!Exposure == "Control") %>%
dplyr::select(Codon) %>%
pull()
control_only_aa_all[!control_only_aa_all %in% non_control_aa_all] %>% unique()
## [1] 55 77 323 379 440 535 673 832 625 684 693 843 1008 846 651
## [16] 770 921 112 166 782 190 360 169 800 811 362 872 648 796 448
## [31] 294 366 395 492 515 733 407 925 176 183 187 653 164 1022 25
vtree(controls_only,
title = "Control-only mutations",
"Tissue"
)
controls_only_ranked <- controls_only %>%
dplyr::group_by(Codon) %>%
tally() %>%
arrange(-n)
controls_only_ranked$Codon <- factor(controls_only_ranked$Codon,
levels = controls_only_ranked$Codon
)
ggplot(controls_only_ranked, aes(x = Codon, y = n, group = Codon)) +
geom_bar(stat = "identity") +
theme(axis.text.x = element_text(angle = 90))
# Export the full cleaned mutation table and the control-only subset as
# tab-separated files.
# NOTE(review): these paths start with "../", whereas other reads and writes in
# this file use "data/..." — confirm which working directory is intended.
write.table(snvs_ins_del_clean,
  file = "../data/processed/summary.txt",
  quote = FALSE, sep = "\t", row.names = FALSE
)
write.table(controls_only,
  file = "../data/processed/controls_only.txt",
  quote = FALSE, sep = "\t", row.names = FALSE
)
breakdown_types <-
snvs_ins_del_clean %>%
mutate(Effect = case_when(
Deletion == 1 ~ "Indel",
Insertion == 1 ~ "Indel",
Ref == "INS" ~ "Indel",
Alt == "DEL" ~ "Indel",
str_detect(Alt, regex("del", ignore_case = TRUE)) ~ "Indel",
str_length(Ref) < str_length(Alt) ~ "Indel",
str_length(Ref) > str_length(Alt) ~ "Indel",
Consequence == "frameshift" ~ "Indel",
Consequence == "nonsense" ~ "Nonsense",
Consequence == "silent" ~ "Silent",
Consequence == "missense" ~ "Missense",
Consequence == "stop lost" ~ "Stop Lost",
Consequence == "complex" ~ "Other",
str_length(Ref) > 1 & str_length(Ref) == str_length(Alt) ~ "Other",
!str_detect(Alt, regex("del", ignore_case = TRUE)) ~ "SNV"
))
# Effect categories split by technology
vtree(breakdown_types,
  title = "Total mutations",
  # imageFileOnly = TRUE,
  "Effect Technology"
)
# Effect categories only, drawn without root node or visible edges
breakdown_types %>%
  dplyr::rename("Total" = Effect) %>%
  vtree(
    title = "Total mutations",
    # imageFileOnly = TRUE,
    "Total",
    arrowhead = "none",
    edgeattr = "style=invis",
    showroot = FALSE
  )
# Type > Consequence > Technology, keeping four consequence classes
vtree(breakdown_types,
  title = "Total mutations",
  # imageFileOnly = TRUE,
  "Type Consequence Technology",
  keep = list(Consequence = c("missense", "nonsense", "silent", "stop lost")),
  vp = FALSE
)
# Join with table on structure predictions
pdb <- read.pdb("1JZ7")
## Note: Accessing on-line PDB file
## PDB has ALT records, taking A only, rm.alt=TRUE
# Run DSSP on the loaded structure
lacz_dssp <- bio3d::dssp(pdb)
# One row per residue: secondary-structure code and accessibility value.
# The element names are split on "_" into CodonRef, Chain and a third field
# that is discarded; only chain A is kept.
ss_df <- data.frame(
  name = names(lacz_dssp$sse),
  ss_code = unlist(lacz_dssp$sse),
  solvent_accessibility = unlist(lacz_dssp$acc)
) %>%
  tidyr::separate(name, c("CodonRef", "Chain", "C"), "_", convert = TRUE) %>%
  dplyr::select(-C) %>%
  dplyr::filter(Chain == "A")
# Lookup from one-letter secondary-structure codes to display labels.
# The label for "I" was mis-encoded ("Ï€-helix"); it has to read "π-helix" so
# that it matches the factor level used for DSSP further down.
str_code <- c(
  "H" = "α-helix",
  "B" = "β-bridge",
  "E" = "β-strand",
  "G" = "310 Helix",
  "I" = "π-helix",
  "T" = "Turn",
  "S" = "Bend"
)
ss_df <- ss_df %>%
mutate(DSSP = str_code[ss_code])
# Per-codon structure predictions read from the NetSurfP output file
structure_1 <- read.table(
  "data/raw/V00296.1_E._coli_gene_lacZ_coding_for_beta-galactosidase__EC_3.2.1.23_.netsurfp.clean.txt",
  header = TRUE, sep = "\t", row.names = NULL
)
# Keep mutations with both a reference and an alternate residue code and
# attach the prediction for their codon (CodonRef matches structure_1$Codon)
structure_analysis <- left_join(
  snvs_ins_del_clean %>%
    dplyr::filter(!is.na(residue_code) & !is.na(alt_code)),
  structure_1,
  by = c("CodonRef" = "Codon")
)
df <- structure_analysis # short alias used for the column lookups below
# Label each row with the class of highest probability; on ties the alpha
# helix is preferred over the beta strand, and the beta strand over coil.
structure_analysis$secondary_structure <- with(
  df,
  ifelse(
    Probability.for.Alpha.Helix >= Probability.for.Beta.strand &
      Probability.for.Alpha.Helix >= Probability.for.Coil,
    "Alpha Helix",
    ifelse(
      Probability.for.Beta.strand >= Probability.for.Alpha.Helix &
        Probability.for.Beta.strand >= Probability.for.Coil,
      "Beta Strand",
      "Coil"
    )
  )
)
# Substitutions whose Grantham distance is at or below this cut-off are
# flagged as conservative.
conservative_threshold <- 50

# Distance between the reference and the alternate residue of every row.
grantham_distances <- grantham_distance(
  x = structure_analysis[["residue_code"]],
  y = structure_analysis[["alt_code"]]
)
structure_analysis[["grantham_distance"]] <- grantham_distances[["d"]]
structure_analysis[["conservative"]] <-
  structure_analysis[["grantham_distance"]] <= conservative_threshold
# Report whether `position` falls inside at least one closed interval
# [start[i], end[i]].
#
# start, end: parallel vectors of interval bounds (both bounds inclusive).
# position:   value to look up.
# Returns TRUE/FALSE; FALSE when there are no intervals, NA when `position`
# is NA and intervals exist.
is_within_range <- function(start, end, position) {
  inside_interval <- start <= position & position <= end
  any(inside_interval)
}
# Label each variant by the PDB HELIX/SHEET record its codon falls in.
# Sheet ranges are checked first, so a codon inside both a sheet and a helix
# range is labelled "Sheet". Codons in neither get the literal string "NA"
# (a real label, not a missing value), which is also the first factor level.
# NOTE(review): every helix/sheet record in `pdb` is used, with no filter on
# chain - confirm that is intended.
structure_analysis <- structure_analysis %>%
  rowwise() %>% # is_within_range() takes one position at a time
  mutate(
    pdb_ss = case_when(
      is_within_range(pdb$sheet$start, pdb$sheet$end, CodonRef) ~ "Sheet",
      is_within_range(pdb$helix$start, pdb$helix$end, CodonRef) ~ "Helix",
      TRUE ~ "NA"
    )
  ) %>%
  # Drop the row-wise grouping so later verbs operate on whole columns again.
  ungroup()
structure_analysis$pdb_ss <- factor(structure_analysis$pdb_ss, levels = c("NA", "Helix", "Sheet"))
# Add the DSSP annotations; the join keys are whatever columns the two tables
# have in common.
structure_analysis <- left_join(structure_analysis, ss_df)

# A residue counts as solvent accessible when its DSSP accessibility value is
# at or above this cut-off.
accessibility_threshold <- 50
structure_analysis <- structure_analysis %>%
  mutate(
    solvent_accessible = solvent_accessibility >= accessibility_threshold,
    # Rows without a DSSP label are treated as coil.
    DSSP = replace_na(DSSP, "Coil")
  )

# "Coil" is the first level and therefore the reference class in the models.
structure_analysis[["DSSP"]] <- factor(
  structure_analysis[["DSSP"]],
  levels = c("Coil", "α-helix", "Bend", "β-bridge", "310 Helix", "π-helix", "β-strand", "Turn")
)
# One row per unique amino-acid-level observation. Ref, Alt and mutation are
# deliberately left out: different nucleotide changes can give the same amino
# acid change, and those duplicates should collapse.
structure_analysis_singletons <- structure_analysis %>%
  dplyr::select(dplyr::all_of(c(
    "Position", "Ref A.A.", "Alt A.A.", "Consequence", "Type",
    "Codon", "aa_change", "PositionRef", "residue_code", "alt_code",
    "CodonRef", "residue_name", "Domain", "Buried.or.Exposed",
    "NetSurf.Amino.Acid", "Probability.for.Alpha.Helix",
    "Probability.for.Beta.strand", "Probability.for.Coil",
    "secondary_structure", "grantham_distance", "conservative",
    "pdb_ss", "Chain", "ss_code", "solvent_accessibility", "solvent_accessible",
    "DSSP"
  ))) %>%
  distinct()
# structure_analysis.all_cols <- structure_analysis
# Restrict the analysis table to the unique missense SNVs.
structure_analysis <- filter(
  structure_analysis_singletons,
  Consequence == "missense",
  Type == "SNV"
)
# Test whether domain is related to degree of conservation
# Variants outside any annotated domain get their own "None" category.
structure_analysis$Domain <- factor(
  replace_na(structure_analysis$Domain, "None")
)
# Rows: domain; columns: conservative (FALSE/TRUE).
contingency_table_domain <- table(structure_analysis[["Domain"]], structure_analysis[["conservative"]])
chi_squared_result_domain <- chisq.test(contingency_table_domain)
print(chi_squared_result_domain)
##
## Pearson's Chi-squared test
##
## data: contingency_table_domain
## X-squared = 7.5554, df = 4, p-value = 0.1093
# Counts per domain in wide form, plus each domain's share (in percent, one
# decimal) of all non-conservative and of all conservative substitutions.
as.data.frame(contingency_table_domain) %>%
  pivot_wider(names_from = Var2, values_from = Freq) %>%
  mutate(
    pct_false = round(100 * `FALSE` / sum(`FALSE`), digits = 1),
    pct_true = round(100 * `TRUE` / sum(`TRUE`), digits = 1)
  )
## # A tibble: 5 × 5
## Var1 `FALSE` `TRUE` pct_false pct_true
## <fct> <int> <int> <dbl> <dbl>
## 1 None 46 21 7 10
## 2 Sugar Binding (PF02837) 120 33 18.4 15.8
## 3 TIM Barrel (PF02836) 264 100 40.4 47.8
## 4 β-Galactosidase (PF00703) 56 13 8.6 6.2
## 5 β-Galactosidase Small Chain (PF02929) 167 42 25.6 20.1
# Test whether the DSSP secondary-structure class is related to whether the
# substitution is conservative.
contingency_table_ss <- table(structure_analysis[["DSSP"]], structure_analysis[["conservative"]])
chi_squared_result_str <- chisq.test(contingency_table_ss)
print(chi_squared_result_str)
##
## Pearson's Chi-squared test
##
## data: contingency_table_ss
## X-squared = 9.8764, df = 7, p-value = 0.1957
# Test whether the DSSP secondary-structure class is related to solvent
# accessibility. solvent_accessible becomes a factor with "TRUE" as the
# reference level, which the later models rely on.
structure_analysis$solvent_accessible <- relevel(
  factor(structure_analysis$solvent_accessible),
  ref = "TRUE"
)
contingency_table <- table(structure_analysis[["DSSP"]], structure_analysis[["solvent_accessible"]])
# Note: this reuses (and overwrites) chi_squared_result_str.
chi_squared_result_str <- chisq.test(contingency_table)
print(chi_squared_result_str)
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 11.364, df = 7, p-value = 0.1235
# Quasi-Poisson regression of Grantham distance on DSSP class
# (reference class: the first factor level, "Coil").
model_ss <- glm(
  formula = grantham_distance ~ DSSP,
  family = quasipoisson,
  data = structure_analysis
)
summary(model_ss)
##
## Call:
## glm(formula = grantham_distance ~ DSSP, family = quasipoisson,
## data = structure_analysis)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.42376 0.03673 120.442 < 2e-16 ***
## DSSPα-helix 0.04156 0.07119 0.584 0.5595
## DSSPBend 0.09851 0.06204 1.588 0.1127
## DSSPβ-bridge 0.19629 0.10167 1.931 0.0538 .
## DSSP310 Helix 0.03638 0.12694 0.287 0.7745
## DSSPπ-helix 0.47308 0.11862 3.988 7.22e-05 ***
## DSSPβ-strand 0.09043 0.04835 1.870 0.0618 .
## DSSPTurn 0.05185 0.06547 0.792 0.4286
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 25.54273)
##
## Null deviance: 23402 on 861 degrees of freedom
## Residual deviance: 22948 on 854 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
# Distribution of Grantham distance within each DSSP class: violins with the
# individual substitutions jittered on top (fixed seed for reproducibility).
ggplot(structure_analysis, aes(x = DSSP, y = grantham_distance)) +
  geom_violin(alpha = 0.5) +
  geom_jitter(
    position = position_jitter(seed = 1, width = 0.2),
    alpha = 0.5
  ) +
  labs(x = "Secondary structure class", y = "Grantham distance") +
  theme_bw()
# Logistic regression: is a substitution conservative, given its DSSP class?
model_ss_binomial <- glm(
  formula = conservative ~ DSSP,
  family = binomial,
  data = structure_analysis
)
summary(model_ss_binomial)
##
## Call:
## glm(formula = conservative ~ DSSP, family = binomial, data = structure_analysis)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.14016 0.15491 -7.360 1.83e-13 ***
## DSSPα-helix 0.25030 0.29212 0.857 0.392
## DSSPBend 0.27996 0.25907 1.081 0.280
## DSSPβ-bridge -0.98010 0.63034 -1.555 0.120
## DSSP310 Helix 0.52112 0.49374 1.055 0.291
## DSSPπ-helix -1.49890 1.04662 -1.432 0.152
## DSSPβ-strand -0.09003 0.21021 -0.428 0.668
## DSSPTurn -0.05508 0.28396 -0.194 0.846
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 954.92 on 861 degrees of freedom
## Residual deviance: 944.00 on 854 degrees of freedom
## AIC: 960
##
## Number of Fisher Scoring iterations: 5
# Logistic regression: is a substitution conservative, given whether the
# residue is solvent accessible (binary predictor)?
model_sa <- glm(
  formula = conservative ~ solvent_accessible,
  family = binomial,
  data = structure_analysis
)
summary(model_sa)
##
## Call:
## glm(formula = conservative ~ solvent_accessible, family = binomial,
## data = structure_analysis)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.7066 0.2009 -3.517 0.000436 ***
## solvent_accessibleFALSE -0.5204 0.2192 -2.374 0.017614 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 940.79 on 852 degrees of freedom
## Residual deviance: 935.40 on 851 degrees of freedom
## (9 observations deleted due to missingness)
## AIC: 939.4
##
## Number of Fisher Scoring iterations: 4
# Quasi-Poisson regression of Grantham distance on the continuous solvent
# accessibility value, fitted on rows without any missing value.
model_sa_continuous <- glm(
  formula = grantham_distance ~ solvent_accessibility,
  family = quasipoisson,
  data = structure_analysis %>% na.omit()
)
summary(model_sa_continuous)
##
## Call:
## glm(formula = grantham_distance ~ solvent_accessibility, family = quasipoisson,
## data = structure_analysis %>% na.omit())
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.5341634 0.0211419 214.46 < 2e-16 ***
## solvent_accessibility -0.0021359 0.0006337 -3.37 0.000785 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 25.47378)
##
## Null deviance: 23056 on 852 degrees of freedom
## Residual deviance: 22752 on 851 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
# Grantham distance against solvent accessibility, with the quasi-Poisson
# trend overlaid. The response is plotted on its original scale: the
# quasi-Poisson smoother already uses a log link, so feeding it
# log(grantham_distance) would apply the log twice and would not correspond
# to model_sa_continuous (nor to the "Grantham distance" axis label).
ggplot(structure_analysis %>% na.omit(), aes(x = solvent_accessibility, y = grantham_distance)) +
  theme_bw() +
  geom_point(position = position_jitter(seed = 1, width = 0.2)) +
  xlab("Solvent accessibility") +
  ylab("Grantham distance") +
  geom_smooth(method = "glm", method.args = list(family = "quasipoisson"))
# Logistic regression of conservative on DSSP class, continuous solvent
# accessibility and their interaction.
model_ss_sa <- glm(
  formula = conservative ~ DSSP * solvent_accessibility,
  family = binomial,
  data = structure_analysis
)
# Fit regression model. Unclear what distribution would be best applied here.
# model_ss_sa <- glm(grantham_distance ~ DSSP * solvent_accessibility, data = structure_analysis, family = quasipoisson)
summary(model_ss_sa)
##
## Call:
## glm(formula = conservative ~ DSSP * solvent_accessibility, family = binomial,
## data = structure_analysis)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.338e+00 1.865e-01 -7.173 7.34e-13 ***
## DSSPα-helix 3.811e-01 3.451e-01 1.104 0.2694
## DSSPBend 2.634e-01 3.298e-01 0.799 0.4244
## DSSPβ-bridge -4.975e-01 7.345e-01 -0.677 0.4982
## DSSP310 Helix 5.611e-01 6.033e-01 0.930 0.3523
## DSSPπ-helix -1.070e+00 1.078e+00 -0.992 0.3210
## DSSPβ-strand 4.014e-02 2.495e-01 0.161 0.8722
## DSSPTurn -2.380e-01 3.707e-01 -0.642 0.5209
## solvent_accessibility 9.557e-03 5.438e-03 1.757 0.0789 .
## DSSPα-helix:solvent_accessibility -6.325e-03 8.916e-03 -0.709 0.4781
## DSSPBend:solvent_accessibility -2.622e-05 9.156e-03 -0.003 0.9977
## DSSPβ-bridge:solvent_accessibility -6.168e-02 1.018e-01 -0.606 0.5446
## DSSP310 Helix:solvent_accessibility -3.413e-03 1.346e-02 -0.254 0.7998
## DSSPπ-helix:solvent_accessibility -4.185e-02 9.609e-02 -0.436 0.6632
## DSSPβ-strand:solvent_accessibility -5.775e-03 7.081e-03 -0.815 0.4148
## DSSPTurn:solvent_accessibility 1.331e-03 7.720e-03 0.172 0.8631
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 940.79 on 852 degrees of freedom
## Residual deviance: 919.26 on 837 degrees of freedom
## (9 observations deleted due to missingness)
## AIC: 951.26
##
## Number of Fisher Scoring iterations: 7
# Diagnostic plots for the interaction model.
plot(model_ss_sa)
# Coefficient table as a data frame, with each estimate also expressed as an
# odds ratio (exp of the log-odds estimate).
results <- as.data.frame(coef(summary(model_ss_sa)))
results[["Odds_ratio"]] <- exp(results[["Estimate"]])
results
## Estimate Std. Error z value
## (Intercept) -1.337865412 0.186516843 -7.172893287
## DSSPα-helix 0.381093277 0.345087046 1.104339562
## DSSPBend 0.263424774 0.329763916 0.798828376
## DSSPβ-bridge -0.497487412 0.734489551 -0.677324015
## DSSP310 Helix 0.561128486 0.603280956 0.930127962
## DSSPπ-helix -1.070111107 1.078327596 -0.992380341
## DSSPβ-strand 0.040139152 0.249487652 0.160886329
## DSSPTurn -0.237994416 0.370742288 -0.641940301
## solvent_accessibility 0.009556906 0.005438413 1.757296937
## DSSPα-helix:solvent_accessibility -0.006324964 0.008916330 -0.709368478
## DSSPBend:solvent_accessibility -0.000026225 0.009156295 -0.002864149
## DSSPβ-bridge:solvent_accessibility -0.061678395 0.101796033 -0.605901752
## DSSP310 Helix:solvent_accessibility -0.003412632 0.013459532 -0.253547583
## DSSPπ-helix:solvent_accessibility -0.041849137 0.096092003 -0.435511137
## DSSPβ-strand:solvent_accessibility -0.005774760 0.007081287 -0.815495851
## DSSPTurn:solvent_accessibility 0.001330821 0.007720211 0.172381469
## Pr(>|z|) Odds_ratio
## (Intercept) 7.342909e-13 0.2624052
## DSSPα-helix 2.694459e-01 1.4638841
## DSSPBend 4.243899e-01 1.3013794
## DSSPβ-bridge 4.982004e-01 0.6080565
## DSSP310 Helix 3.523048e-01 1.7526492
## DSSPπ-helix 3.210120e-01 0.3429704
## DSSPβ-strand 8.721829e-01 1.0409556
## DSSPTurn 5.209119e-01 0.7882071
## solvent_accessibility 7.886721e-02 1.0096027
## DSSPα-helix:solvent_accessibility 4.780958e-01 0.9936950
## DSSPBend:solvent_accessibility 9.977147e-01 0.9999738
## DSSPβ-bridge:solvent_accessibility 5.445800e-01 0.9401852
## DSSP310 Helix:solvent_accessibility 7.998451e-01 0.9965932
## DSSPπ-helix:solvent_accessibility 6.631915e-01 0.9590144
## DSSPβ-strand:solvent_accessibility 4.147885e-01 0.9942419
## DSSPTurn:solvent_accessibility 8.631376e-01 1.0013317
# Confidence intervals (2.5 % and 97.5 % bounds) for the coefficients of
# model_ss_sa, on the same log-odds scale as the estimates.
confint(model_ss_sa)
## 2.5 % 97.5 %
## (Intercept) -1.715969795 -0.982915481
## DSSPα-helix -0.309609403 1.048868330
## DSSPBend -0.394347031 0.902794366
## DSSPβ-bridge -2.111682416 0.823127064
## DSSP310 Helix -0.708796657 1.704176990
## DSSPπ-helix -4.046420462 0.643090646
## DSSPβ-strand -0.447366645 0.532671991
## DSSPTurn -0.990979627 0.470036047
## solvent_accessibility -0.001422713 0.020225226
## DSSPα-helix:solvent_accessibility -0.024476540 0.011115053
## DSSPBend:solvent_accessibility -0.018109621 0.018044893
## DSSPβ-bridge:solvent_accessibility -0.406244214 0.017826087
## DSSP310 Helix:solvent_accessibility -0.031128675 0.024129809
## DSSPπ-helix:solvent_accessibility NA 0.031706633
## DSSPβ-strand:solvent_accessibility -0.019806659 0.008172705
## DSSPTurn:solvent_accessibility -0.013777643 0.016767738
# Turn the coefficient names into readable labels.
# The intercept corresponds to the reference class, "Coil".
term_labels <- rownames(results) %>% str_replace("DSSP", "")
term_labels[1] <- "Coil"
rownames(results) <- term_labels

# Terms are classified from their names rather than by fixed row positions, so
# the labelling stays correct if the set of estimated coefficients changes.
# model_ss_sa uses the continuous `solvent_accessibility` predictor, so the
# suffix to strip is ":solvent_accessibility".
has_accessibility_term <- str_detect(term_labels, "solvent_accessibility")
results$secondary_structure <- term_labels %>%
  str_replace(":?solvent_accessibility$", "")
# The bare solvent_accessibility term is the slope for the reference class.
results$secondary_structure[results$secondary_structure == ""] <- "Coil"
# FALSE: class effect on its own; TRUE: term involving solvent accessibility
# (a per-unit slope, since the predictor is continuous).
results$solvent_accessible <- has_accessibility_term
# results <- results %>% dplyr::arrange(solvent_accessible, Odds_ratio)
kableExtra::kbl(results %>% relocate(secondary_structure, solvent_accessible), row.names = FALSE, digits = 2)
| secondary_structure | solvent_accessible | Estimate | Std. Error | z value | Pr(>|z|) | Odds_ratio |
|---|---|---|---|---|---|---|
| Coil | FALSE | -1.34 | 0.19 | -7.17 | 0.00 | 0.26 |
| α-helix | FALSE | 0.38 | 0.35 | 1.10 | 0.27 | 1.46 |
| Bend | FALSE | 0.26 | 0.33 | 0.80 | 0.42 | 1.30 |
| β-bridge | FALSE | -0.50 | 0.73 | -0.68 | 0.50 | 0.61 |
| 310 Helix | FALSE | 0.56 | 0.60 | 0.93 | 0.35 | 1.75 |
| π-helix | FALSE | -1.07 | 1.08 | -0.99 | 0.32 | 0.34 |
| β-strand | FALSE | 0.04 | 0.25 | 0.16 | 0.87 | 1.04 |
| Turn | FALSE | -0.24 | 0.37 | -0.64 | 0.52 | 0.79 |
| Coil | TRUE | 0.01 | 0.01 | 1.76 | 0.08 | 1.01 |
| α-helix:solvent_accessibility | TRUE | -0.01 | 0.01 | -0.71 | 0.48 | 0.99 |
| Bend:solvent_accessibility | TRUE | 0.00 | 0.01 | 0.00 | 1.00 | 1.00 |
| β-bridge:solvent_accessibility | TRUE | -0.06 | 0.10 | -0.61 | 0.54 | 0.94 |
| 310 Helix:solvent_accessibility | TRUE | 0.00 | 0.01 | -0.25 | 0.80 | 1.00 |
| π-helix:solvent_accessibility | TRUE | -0.04 | 0.10 | -0.44 | 0.66 | 0.96 |
| β-strand:solvent_accessibility | TRUE | -0.01 | 0.01 | -0.82 | 0.41 | 0.99 |
| Turn:solvent_accessibility | TRUE | 0.00 | 0.01 | 0.17 | 0.86 | 1.00 |
# Same coefficient table as a flextable, with default formatting applied to
# the double columns.
colformat_double(flextable(results))
Estimate | Std. Error | z value | Pr(>|z|) | Odds_ratio | secondary_structure | solvent_accessible |
|---|---|---|---|---|---|---|
-1.3 | 0.2 | -7.2 | 0.0 | 0.3 | Coil | FALSE |
0.4 | 0.3 | 1.1 | 0.3 | 1.5 | α-helix | FALSE |
0.3 | 0.3 | 0.8 | 0.4 | 1.3 | Bend | FALSE |
-0.5 | 0.7 | -0.7 | 0.5 | 0.6 | β-bridge | FALSE |
0.6 | 0.6 | 0.9 | 0.4 | 1.8 | 310 Helix | FALSE |
-1.1 | 1.1 | -1.0 | 0.3 | 0.3 | π-helix | FALSE |
0.0 | 0.2 | 0.2 | 0.9 | 1.0 | β-strand | FALSE |
-0.2 | 0.4 | -0.6 | 0.5 | 0.8 | Turn | FALSE |
0.0 | 0.0 | 1.8 | 0.1 | 1.0 | Coil | TRUE |
-0.0 | 0.0 | -0.7 | 0.5 | 1.0 | α-helix:solvent_accessibility | TRUE |
-0.0 | 0.0 | -0.0 | 1.0 | 1.0 | Bend:solvent_accessibility | TRUE |
-0.1 | 0.1 | -0.6 | 0.5 | 0.9 | β-bridge:solvent_accessibility | TRUE |
-0.0 | 0.0 | -0.3 | 0.8 | 1.0 | 310 Helix:solvent_accessibility | TRUE |
-0.0 | 0.1 | -0.4 | 0.7 | 1.0 | π-helix:solvent_accessibility | TRUE |
-0.0 | 0.0 | -0.8 | 0.4 | 1.0 | β-strand:solvent_accessibility | TRUE |
0.0 | 0.0 | 0.2 | 0.9 | 1.0 | Turn:solvent_accessibility | TRUE |
# Want to know whether we find different substitutions in secondary structure
# is_ss is TRUE for every DSSP class other than "Coil".
# aa_change is used as is. The former second mutate() passed unnamed
# arguments, so it never modified aa_change and only added a stray column
# named after the ifelse() expression; it has been dropped.
structure_analysis_specific <- structure_analysis %>%
  dplyr::mutate(is_ss = DSSP != "Coil")
structure_analysis_specific$aa_change <- factor(structure_analysis_specific$aa_change)
# Cross-tabulate amino acid change against is_ss and test for association.
changes_and_ss <- table(structure_analysis_specific[["aa_change"]], structure_analysis_specific[["is_ss"]])
chi_squared_result_ss <- chisq.test(changes_and_ss)
print(chi_squared_result_ss)
##
## Pearson's Chi-squared test
##
## data: changes_and_ss
## X-squared = 165.52, df = 132, p-value = 0.02556
# Logistic regression: are certain amino acid changes specific to residues in
# secondary structure (is_ss)?
model <- glm(
  formula = is_ss ~ aa_change,
  family = binomial,
  data = structure_analysis_specific
)
# Coefficient summary
summary(model)
##
## Call:
## glm(formula = is_ss ~ aa_change, family = binomial, data = structure_analysis_specific)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.857e+01 2.663e+03 0.007 0.994
## aa_changeA>E -1.857e+01 2.663e+03 -0.007 0.994
## aa_changeA>G -1.828e+01 2.663e+03 -0.007 0.995
## aa_changeA>P -1.801e+01 2.663e+03 -0.007 0.995
## aa_changeA>S -1.759e+01 2.663e+03 -0.007 0.995
## aa_changeA>T -1.765e+01 2.663e+03 -0.007 0.995
## aa_changeA>V -1.696e+01 2.663e+03 -0.006 0.995
## aa_changeC>F 3.101e-07 4.612e+03 0.000 1.000
## aa_changeC>R -1.787e+01 2.663e+03 -0.007 0.995
## aa_changeC>S 3.131e-07 5.326e+03 0.000 1.000
## aa_changeC>W 3.098e-07 4.612e+03 0.000 1.000
## aa_changeC>Y 3.100e-07 4.612e+03 0.000 1.000
## aa_changeD>A -1.857e+01 2.663e+03 -0.007 0.994
## aa_changeD>E -1.706e+01 2.663e+03 -0.006 0.995
## aa_changeD>G 3.098e-07 4.210e+03 0.000 1.000
## aa_changeD>H -1.816e+01 2.663e+03 -0.007 0.995
## aa_changeD>N -1.754e+01 2.663e+03 -0.007 0.995
## aa_changeD>V -1.662e+01 2.663e+03 -0.006 0.995
## aa_changeD>Y -1.718e+01 2.663e+03 -0.006 0.995
## aa_changeE>A -1.787e+01 2.663e+03 -0.007 0.995
## aa_changeE>D -1.662e+01 2.663e+03 -0.006 0.995
## aa_changeE>G -1.662e+01 2.663e+03 -0.006 0.995
## aa_changeE>K -1.703e+01 2.663e+03 -0.006 0.995
## aa_changeE>Q -1.747e+01 2.663e+03 -0.007 0.995
## aa_changeE>V -1.731e+01 2.663e+03 -0.007 0.995
## aa_changeF>C -3.713e+01 7.045e+03 -0.005 0.996
## aa_changeF>I 3.132e-07 5.326e+03 0.000 1.000
## aa_changeF>L -1.857e+01 2.663e+03 -0.007 0.994
## aa_changeF>V -1.857e+01 2.663e+03 -0.007 0.994
## aa_changeF>Y 3.093e-07 7.045e+03 0.000 1.000
## aa_changeG>A -1.718e+01 2.663e+03 -0.006 0.995
## aa_changeG>C -1.727e+01 2.663e+03 -0.006 0.995
## aa_changeG>D -1.795e+01 2.663e+03 -0.007 0.995
## aa_changeG>E -1.696e+01 2.663e+03 -0.006 0.995
## aa_changeG>R -1.762e+01 2.663e+03 -0.007 0.995
## aa_changeG>S -1.718e+01 2.663e+03 -0.006 0.995
## aa_changeG>V -1.747e+01 2.663e+03 -0.007 0.995
## aa_changeG>W -1.787e+01 2.663e+03 -0.007 0.995
## aa_changeH>D -1.787e+01 2.663e+03 -0.007 0.995
## aa_changeH>L -1.787e+01 2.663e+03 -0.007 0.995
## aa_changeH>N -1.787e+01 2.663e+03 -0.007 0.995
## aa_changeH>P -1.765e+01 2.663e+03 -0.007 0.995
## aa_changeH>Q -1.806e+01 2.663e+03 -0.007 0.995
## aa_changeH>R -1.885e+01 2.663e+03 -0.007 0.994
## aa_changeH>Y -1.897e+01 2.663e+03 -0.007 0.994
## aa_changeI>F -1.677e+01 2.663e+03 -0.006 0.995
## aa_changeI>M -3.713e+01 7.045e+03 -0.005 0.996
## aa_changeI>N 3.097e-07 3.523e+03 0.000 1.000
## aa_changeI>T 3.093e-07 7.045e+03 0.000 1.000
## aa_changeI>V -1.857e+01 2.663e+03 -0.007 0.994
## aa_changeK>E -1.718e+01 2.663e+03 -0.006 0.995
## aa_changeK>N 3.096e-07 4.612e+03 0.000 1.000
## aa_changeK>Q 3.093e-07 5.326e+03 0.000 1.000
## aa_changeL>F -1.857e+01 2.663e+03 -0.007 0.994
## aa_changeL>H -1.857e+01 2.663e+03 -0.007 0.994
## aa_changeL>I -1.787e+01 2.663e+03 -0.007 0.995
## aa_changeL>M -1.857e+01 2.663e+03 -0.007 0.994
## aa_changeL>P -1.703e+01 2.663e+03 -0.006 0.995
## aa_changeL>Q -1.787e+01 2.663e+03 -0.007 0.995
## aa_changeL>R -1.718e+01 2.663e+03 -0.006 0.995
## aa_changeL>V -1.787e+01 2.663e+03 -0.007 0.995
## aa_changeM>I -1.816e+01 2.663e+03 -0.007 0.995
## aa_changeM>K -1.747e+01 2.663e+03 -0.007 0.995
## aa_changeM>L -1.787e+01 2.663e+03 -0.007 0.995
## aa_changeM>R -1.816e+01 2.663e+03 -0.007 0.995
## aa_changeM>T 3.135e-07 7.045e+03 0.000 1.000
## aa_changeM>V 3.093e-07 5.326e+03 0.000 1.000
## aa_changeN>D 3.139e-07 4.210e+03 0.000 1.000
## aa_changeN>H -1.857e+01 2.663e+03 -0.007 0.994
## aa_changeN>I -1.772e+01 2.663e+03 -0.007 0.995
## aa_changeN>K -1.765e+01 2.663e+03 -0.007 0.995
## aa_changeN>S 3.093e-07 4.612e+03 0.000 1.000
## aa_changeN>T -1.926e+01 2.663e+03 -0.007 0.994
## aa_changeN>Y 3.135e-07 3.766e+03 0.000 1.000
## aa_changeP>A -1.857e+01 2.663e+03 -0.007 0.994
## aa_changeP>H -1.926e+01 2.663e+03 -0.007 0.994
## aa_changeP>L -1.841e+01 2.663e+03 -0.007 0.994
## aa_changeP>Q -1.977e+01 2.663e+03 -0.007 0.994
## aa_changeP>R -1.948e+01 2.663e+03 -0.007 0.994
## aa_changeP>S -1.828e+01 2.663e+03 -0.007 0.995
## aa_changeP>T -1.838e+01 2.663e+03 -0.007 0.994
## aa_changeQ>E 3.130e-07 4.210e+03 0.000 1.000
## aa_changeQ>H -1.696e+01 2.663e+03 -0.006 0.995
## aa_changeQ>K -1.857e+01 2.663e+03 -0.007 0.994
## aa_changeQ>L -3.713e+01 4.612e+03 -0.008 0.994
## aa_changeQ>P -1.718e+01 2.663e+03 -0.006 0.995
## aa_changeQ>R -1.718e+01 2.663e+03 -0.006 0.995
## aa_changeR>C -1.718e+01 2.663e+03 -0.006 0.995
## aa_changeR>G -1.747e+01 2.663e+03 -0.007 0.995
## aa_changeR>H -1.696e+01 2.663e+03 -0.006 0.995
## aa_changeR>L -1.739e+01 2.663e+03 -0.007 0.995
## aa_changeR>P -1.696e+01 2.663e+03 -0.006 0.995
## aa_changeR>Q 3.132e-07 5.326e+03 0.000 1.000
## aa_changeR>S -1.823e+01 2.663e+03 -0.007 0.995
## aa_changeR>W 3.137e-07 7.045e+03 0.000 1.000
## aa_changeS>C -1.816e+01 2.663e+03 -0.007 0.995
## aa_changeS>F 3.094e-07 7.045e+03 0.000 1.000
## aa_changeS>G -1.926e+01 2.663e+03 -0.007 0.994
## aa_changeS>I -1.926e+01 2.663e+03 -0.007 0.994
## aa_changeS>L -1.696e+01 2.663e+03 -0.006 0.995
## aa_changeS>N -1.857e+01 2.663e+03 -0.007 0.994
## aa_changeS>P -1.857e+01 2.663e+03 -0.007 0.994
## aa_changeS>R -1.875e+01 2.663e+03 -0.007 0.994
## aa_changeS>T -3.713e+01 7.045e+03 -0.005 0.996
## aa_changeS>W 3.093e-07 4.612e+03 0.000 1.000
## aa_changeS>Y 3.139e-07 7.045e+03 0.000 1.000
## aa_changeT>A -1.747e+01 2.663e+03 -0.007 0.995
## aa_changeT>I 3.140e-07 5.326e+03 0.000 1.000
## aa_changeT>K 3.138e-07 4.210e+03 0.000 1.000
## aa_changeT>M 3.138e-07 4.612e+03 0.000 1.000
## aa_changeT>N -3.713e+01 7.045e+03 -0.005 0.996
## aa_changeT>P -1.828e+01 2.663e+03 -0.007 0.995
## aa_changeT>R 3.132e-07 4.612e+03 0.000 1.000
## aa_changeT>S -1.747e+01 2.663e+03 -0.007 0.995
## aa_changeV>A 3.100e-07 3.950e+03 0.000 1.000
## aa_changeV>D 3.094e-07 3.629e+03 0.000 1.000
## aa_changeV>E 3.093e-07 4.612e+03 0.000 1.000
## aa_changeV>F 3.140e-07 3.766e+03 0.000 1.000
## aa_changeV>G 3.094e-07 5.326e+03 0.000 1.000
## aa_changeV>I -1.718e+01 2.663e+03 -0.006 0.995
## aa_changeV>L 3.100e-07 3.523e+03 0.000 1.000
## aa_changeV>M 3.140e-07 4.210e+03 0.000 1.000
## aa_changeW>C 3.135e-07 3.368e+03 0.000 1.000
## aa_changeW>G 3.101e-07 4.612e+03 0.000 1.000
## aa_changeW>L 3.094e-07 3.629e+03 0.000 1.000
## aa_changeW>R 3.131e-07 3.219e+03 0.000 1.000
## aa_changeW>S 3.094e-07 5.326e+03 0.000 1.000
## aa_changeY>C -1.765e+01 2.663e+03 -0.007 0.995
## aa_changeY>D 3.132e-07 4.612e+03 0.000 1.000
## aa_changeY>F 3.094e-07 4.612e+03 0.000 1.000
## aa_changeY>H 3.139e-07 5.326e+03 0.000 1.000
## aa_changeY>N 3.093e-07 4.210e+03 0.000 1.000
## aa_changeY>S -1.857e+01 2.663e+03 -0.007 0.994
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.93 on 861 degrees of freedom
## Residual deviance: 798.20 on 729 degrees of freedom
## AIC: 1064.2
##
## Number of Fisher Scoring iterations: 17
# Diagnostic plots, then the coefficient table with odds ratios.
plot(model)
results_specific <- as.data.frame(coef(summary(model)))
results_specific[["Odds_ratio"]] <- exp(results_specific[["Estimate"]])
# Terms with p < 0.05, ordered by increasing odds ratio.
results_specific_sig <- dplyr::arrange(
  dplyr::filter(results_specific, `Pr(>|z|)` < 0.05),
  Odds_ratio
)
# Fit regression model
# Is the reference AA important for the type of SS?
# The response must not also appear among the predictors: with DSSP on both
# sides of the formula most coefficients are aliased and nothing can be
# estimated. A binomial glm on a multi-level factor only contrasts the first
# level ("Coil") with all other levels, which is exactly is_ss, so the model
# is written that way with the reference amino acid as the predictor.
# NOTE(review): a per-class (multinomial) model would need a different fitting
# function - confirm whether coil vs. non-coil is sufficient here.
model <- glm(is_ss ~ `Ref A.A.`, data = structure_analysis_specific, family = binomial)
# Get summary of the model
summary(model)
##
## Call:
## glm(formula = DSSP ~ aa_change + DSSP:aa_change, family = binomial,
## data = structure_analysis_specific)
##
## Coefficients: (637 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.657e+01 2.518e+05 0 1
## aa_changeA>E -5.313e+01 2.980e+05 0 1
## aa_changeA>G -5.313e+01 3.251e+05 0 1
## aa_changeA>P -5.313e+01 3.084e+05 0 1
## aa_changeA>S -5.313e+01 3.251e+05 0 1
## aa_changeA>T -5.313e+01 3.561e+05 0 1
## aa_changeA>V -5.313e+01 4.362e+05 0 1
## aa_changeC>F 4.418e-06 4.362e+05 0 1
## aa_changeC>R -5.313e+01 3.561e+05 0 1
## aa_changeC>S 4.622e-06 3.561e+05 0 1
## aa_changeC>W -1.915e-08 3.561e+05 0 1
## aa_changeC>Y -1.240e-08 4.362e+05 0 1
## aa_changeD>A -5.313e+01 3.251e+05 0 1
## aa_changeD>E -5.313e+01 3.561e+05 0 1
## aa_changeD>G -1.909e-07 4.362e+05 0 1
## aa_changeD>H -5.313e+01 3.561e+05 0 1
## aa_changeD>N -5.313e+01 2.980e+05 0 1
## aa_changeD>V -5.313e+01 4.362e+05 0 1
## aa_changeD>Y -5.313e+01 3.251e+05 0 1
## aa_changeE>A -5.313e+01 4.362e+05 0 1
## aa_changeE>D -5.313e+01 4.362e+05 0 1
## aa_changeE>G -5.313e+01 4.362e+05 0 1
## aa_changeE>K -5.313e+01 3.251e+05 0 1
## aa_changeE>Q -5.313e+01 4.362e+05 0 1
## aa_changeE>V -5.313e+01 3.561e+05 0 1
## aa_changeF>C -5.313e+01 4.362e+05 0 1
## aa_changeF>I 4.633e-06 4.362e+05 0 1
## aa_changeF>L -5.313e+01 3.251e+05 0 1
## aa_changeF>V -5.313e+01 4.362e+05 0 1
## aa_changeF>Y 4.429e-06 4.362e+05 0 1
## aa_changeG>A -5.313e+01 4.362e+05 0 1
## aa_changeG>C -5.313e+01 3.251e+05 0 1
## aa_changeG>D -5.313e+01 2.855e+05 0 1
## aa_changeG>E -5.313e+01 4.362e+05 0 1
## aa_changeG>R -5.313e+01 2.855e+05 0 1
## aa_changeG>S -5.313e+01 3.084e+05 0 1
## aa_changeG>V -5.313e+01 2.815e+05 0 1
## aa_changeG>W -5.313e+01 4.362e+05 0 1
## aa_changeH>D -5.313e+01 4.362e+05 0 1
## aa_changeH>L -5.313e+01 3.561e+05 0 1
## aa_changeH>N -5.313e+01 4.362e+05 0 1
## aa_changeH>P -5.313e+01 3.561e+05 0 1
## aa_changeH>Q -5.313e+01 3.251e+05 0 1
## aa_changeH>R -5.313e+01 3.084e+05 0 1
## aa_changeH>Y -5.313e+01 3.251e+05 0 1
## aa_changeI>F -5.313e+01 4.362e+05 0 1
## aa_changeI>M -5.313e+01 4.362e+05 0 1
## aa_changeI>N 4.625e-06 2.908e+05 0 1
## aa_changeI>T 4.639e-06 4.362e+05 0 1
## aa_changeI>V -5.313e+01 3.561e+05 0 1
## aa_changeK>E -5.313e+01 4.362e+05 0 1
## aa_changeK>N -1.867e-07 4.362e+05 0 1
## aa_changeK>Q 4.631e-06 4.362e+05 0 1
## aa_changeL>F -5.313e+01 4.362e+05 0 1
## aa_changeL>H -5.313e+01 4.362e+05 0 1
## aa_changeL>I -5.313e+01 4.362e+05 0 1
## aa_changeL>M -5.313e+01 3.251e+05 0 1
## aa_changeL>P -5.313e+01 3.251e+05 0 1
## aa_changeL>Q -5.313e+01 3.561e+05 0 1
## aa_changeL>R -5.313e+01 3.561e+05 0 1
## aa_changeL>V -5.313e+01 4.362e+05 0 1
## aa_changeM>I -5.313e+01 3.561e+05 0 1
## aa_changeM>K -5.313e+01 4.362e+05 0 1
## aa_changeM>L -5.313e+01 4.362e+05 0 1
## aa_changeM>R -5.313e+01 3.561e+05 0 1
## aa_changeM>T 4.414e-06 4.362e+05 0 1
## aa_changeM>V -1.792e-07 4.362e+05 0 1
## aa_changeN>D 4.429e-06 4.362e+05 0 1
## aa_changeN>H -5.313e+01 4.362e+05 0 1
## aa_changeN>I -5.313e+01 3.251e+05 0 1
## aa_changeN>K -5.313e+01 3.084e+05 0 1
## aa_changeN>S 4.437e-06 3.561e+05 0 1
## aa_changeN>T -5.313e+01 3.561e+05 0 1
## aa_changeN>Y -1.292e-08 4.362e+05 0 1
## aa_changeP>A -5.313e+01 3.561e+05 0 1
## aa_changeP>H -5.313e+01 3.561e+05 0 1
## aa_changeP>L -5.313e+01 2.908e+05 0 1
## aa_changeP>Q -5.313e+01 2.759e+05 0 1
## aa_changeP>R -5.313e+01 2.980e+05 0 1
## aa_changeP>S -5.313e+01 3.251e+05 0 1
## aa_changeP>T -5.313e+01 2.980e+05 0 1
## aa_changeQ>E 4.456e-06 3.561e+05 0 1
## aa_changeQ>H -5.313e+01 3.561e+05 0 1
## aa_changeQ>K -5.313e+01 2.980e+05 0 1
## aa_changeQ>L -5.313e+01 3.251e+05 0 1
## aa_changeQ>P -5.313e+01 4.362e+05 0 1
## aa_changeQ>R -5.313e+01 4.362e+05 0 1
## aa_changeR>C -5.313e+01 3.251e+05 0 1
## aa_changeR>G -5.313e+01 3.561e+05 0 1
## aa_changeR>H -5.313e+01 3.561e+05 0 1
## aa_changeR>L -5.313e+01 3.084e+05 0 1
## aa_changeR>P -5.313e+01 3.251e+05 0 1
## aa_changeR>Q 4.642e-06 3.561e+05 0 1
## aa_changeR>S -5.313e+01 2.980e+05 0 1
## aa_changeR>W -1.403e-08 4.362e+05 0 1
## aa_changeS>C -5.313e+01 3.561e+05 0 1
## aa_changeS>F 4.438e-06 4.362e+05 0 1
## aa_changeS>G -5.313e+01 3.561e+05 0 1
## aa_changeS>I -5.313e+01 3.561e+05 0 1
## aa_changeS>L -5.313e+01 4.362e+05 0 1
## aa_changeS>N -5.313e+01 3.561e+05 0 1
## aa_changeS>P -5.313e+01 3.084e+05 0 1
## aa_changeS>R -5.313e+01 2.908e+05 0 1
## aa_changeS>T -5.313e+01 4.362e+05 0 1
## aa_changeS>W 4.415e-06 4.362e+05 0 1
## aa_changeS>Y 4.423e-06 4.362e+05 0 1
## aa_changeT>A -5.313e+01 4.362e+05 0 1
## aa_changeT>I -4.102e-09 4.362e+05 0 1
## aa_changeT>K -2.201e-07 3.251e+05 0 1
## aa_changeT>M 4.418e-06 4.362e+05 0 1
## aa_changeT>N -5.313e+01 4.362e+05 0 1
## aa_changeT>P -5.313e+01 3.251e+05 0 1
## aa_changeT>R 4.629e-06 3.251e+05 0 1
## aa_changeT>S -5.313e+01 4.362e+05 0 1
## aa_changeV>A 4.625e-06 2.980e+05 0 1
## aa_changeV>D -1.797e-10 2.855e+05 0 1
## aa_changeV>E -1.991e-07 3.251e+05 0 1
## aa_changeV>F 4.423e-06 2.980e+05 0 1
## aa_changeV>G -1.834e-07 4.362e+05 0 1
## aa_changeV>I -5.313e+01 4.362e+05 0 1
## aa_changeV>L 4.415e-06 2.980e+05 0 1
## aa_changeV>M 3.387e-10 3.561e+05 0 1
## aa_changeW>C 4.434e-06 3.251e+05 0 1
## aa_changeW>G 2.176e-09 4.362e+05 0 1
## aa_changeW>L 1.066e-08 4.362e+05 0 1
## aa_changeW>R 4.417e-06 4.362e+05 0 1
## aa_changeW>S 4.422e-06 4.362e+05 0 1
## aa_changeY>C -5.313e+01 3.561e+05 0 1
## aa_changeY>D 2.042e-09 3.561e+05 0 1
## aa_changeY>F 4.439e-06 3.561e+05 0 1
## aa_changeY>H 7.669e-09 4.362e+05 0 1
## aa_changeY>N 9.123e-09 4.362e+05 0 1
## aa_changeY>S -5.313e+01 4.362e+05 0 1
## DSSPα-helix:aa_changeA>D 4.415e-06 4.362e+05 0 1
## DSSPBend:aa_changeA>D NA NA NA NA
## DSSPβ-bridge:aa_changeA>D NA NA NA NA
## DSSP310 Helix:aa_changeA>D NA NA NA NA
## DSSPÏ€-helix:aa_changeA>D NA NA NA NA
## DSSPβ-strand:aa_changeA>D 4.625e-06 3.251e+05 0 1
## DSSPTurn:aa_changeA>D NA NA NA NA
## DSSPα-helix:aa_changeA>E NA NA NA NA
## DSSPBend:aa_changeA>E 5.313e+01 2.980e+05 0 1
## DSSPβ-bridge:aa_changeA>E NA NA NA NA
## DSSP310 Helix:aa_changeA>E NA NA NA NA
## DSSPÏ€-helix:aa_changeA>E NA NA NA NA
## DSSPβ-strand:aa_changeA>E 5.313e+01 2.601e+05 0 1
## DSSPTurn:aa_changeA>E NA NA NA NA
## DSSPα-helix:aa_changeA>G NA NA NA NA
## DSSPBend:aa_changeA>G 5.313e+01 4.112e+05 0 1
## DSSPβ-bridge:aa_changeA>G NA NA NA NA
## DSSP310 Helix:aa_changeA>G NA NA NA NA
## DSSPÏ€-helix:aa_changeA>G NA NA NA NA
## DSSPβ-strand:aa_changeA>G 5.313e+01 3.251e+05 0 1
## DSSPTurn:aa_changeA>G 5.313e+01 4.112e+05 0 1
## DSSPα-helix:aa_changeA>P NA NA NA NA
## DSSPBend:aa_changeA>P 5.313e+01 2.720e+05 0 1
## DSSPβ-bridge:aa_changeA>P NA NA NA NA
## DSSP310 Helix:aa_changeA>P NA NA NA NA
## DSSPÏ€-helix:aa_changeA>P NA NA NA NA
## DSSPβ-strand:aa_changeA>P 5.313e+01 2.518e+05 0 1
## DSSPTurn:aa_changeA>P NA NA NA NA
## DSSPα-helix:aa_changeA>S NA NA NA NA
## DSSPBend:aa_changeA>S 5.313e+01 2.908e+05 0 1
## DSSPβ-bridge:aa_changeA>S NA NA NA NA
## DSSP310 Helix:aa_changeA>S NA NA NA NA
## DSSPÏ€-helix:aa_changeA>S NA NA NA NA
## DSSPβ-strand:aa_changeA>S 5.313e+01 2.720e+05 0 1
## DSSPTurn:aa_changeA>S 5.313e+01 4.112e+05 0 1
## DSSPα-helix:aa_changeA>T 5.313e+01 4.362e+05 0 1
## DSSPBend:aa_changeA>T NA NA NA NA
## DSSPβ-bridge:aa_changeA>T NA NA NA NA
## DSSP310 Helix:aa_changeA>T NA NA NA NA
## DSSPÏ€-helix:aa_changeA>T NA NA NA NA
## DSSPβ-strand:aa_changeA>T 5.313e+01 3.084e+05 0 1
## DSSPTurn:aa_changeA>T NA NA NA NA
## DSSPα-helix:aa_changeA>V 5.313e+01 4.362e+05 0 1
## DSSPBend:aa_changeA>V 5.313e+01 5.036e+05 0 1
## DSSPβ-bridge:aa_changeA>V NA NA NA NA
## DSSP310 Helix:aa_changeA>V 5.313e+01 5.036e+05 0 1
## DSSPÏ€-helix:aa_changeA>V NA NA NA NA
## DSSPβ-strand:aa_changeA>V 5.313e+01 5.036e+05 0 1
## DSSPTurn:aa_changeA>V NA NA NA NA
## DSSPα-helix:aa_changeC>F NA NA NA NA
## DSSPBend:aa_changeC>F NA NA NA NA
## DSSPβ-bridge:aa_changeC>F NA NA NA NA
## DSSP310 Helix:aa_changeC>F 1.670e-08 5.036e+05 0 1
## DSSPÏ€-helix:aa_changeC>F 2.421e-08 5.036e+05 0 1
## DSSPβ-strand:aa_changeC>F NA NA NA NA
## DSSPTurn:aa_changeC>F NA NA NA NA
## DSSPα-helix:aa_changeC>R NA NA NA NA
## DSSPBend:aa_changeC>R NA NA NA NA
## DSSPβ-bridge:aa_changeC>R NA NA NA NA
## DSSP310 Helix:aa_changeC>R 5.313e+01 4.362e+05 0 1
## DSSPÏ€-helix:aa_changeC>R 5.313e+01 4.362e+05 0 1
## DSSPβ-strand:aa_changeC>R 5.313e+01 3.561e+05 0 1
## DSSPTurn:aa_changeC>R NA NA NA NA
## DSSPα-helix:aa_changeC>S NA NA NA NA
## DSSPBend:aa_changeC>S NA NA NA NA
## DSSPβ-bridge:aa_changeC>S NA NA NA NA
## DSSP310 Helix:aa_changeC>S NA NA NA NA
## DSSPÏ€-helix:aa_changeC>S NA NA NA NA
## DSSPβ-strand:aa_changeC>S NA NA NA NA
## DSSPTurn:aa_changeC>S NA NA NA NA
## DSSPα-helix:aa_changeC>W NA NA NA NA
## DSSPBend:aa_changeC>W NA NA NA NA
## DSSPβ-bridge:aa_changeC>W NA NA NA NA
## DSSP310 Helix:aa_changeC>W NA NA NA NA
## DSSPÏ€-helix:aa_changeC>W 4.439e-06 4.362e+05 0 1
## DSSPβ-strand:aa_changeC>W NA NA NA NA
## DSSPTurn:aa_changeC>W NA NA NA NA
## DSSPα-helix:aa_changeC>Y NA NA NA NA
## DSSPBend:aa_changeC>Y NA NA NA NA
## DSSPβ-bridge:aa_changeC>Y NA NA NA NA
## DSSP310 Helix:aa_changeC>Y -2.153e-07 5.036e+05 0 1
## DSSPÏ€-helix:aa_changeC>Y 4.643e-06 5.036e+05 0 1
## DSSPβ-strand:aa_changeC>Y NA NA NA NA
## DSSPTurn:aa_changeC>Y NA NA NA NA
## DSSPα-helix:aa_changeD>A 5.313e+01 4.112e+05 0 1
## DSSPBend:aa_changeD>A NA NA NA NA
## DSSPβ-bridge:aa_changeD>A NA NA NA NA
## DSSP310 Helix:aa_changeD>A NA NA NA NA
## DSSPÏ€-helix:aa_changeD>A NA NA NA NA
## DSSPβ-strand:aa_changeD>A NA NA NA NA
## DSSPTurn:aa_changeD>A 5.313e+01 3.251e+05 0 1
## DSSPα-helix:aa_changeD>E 5.313e+01 4.362e+05 0 1
## DSSPBend:aa_changeD>E 5.313e+01 3.561e+05 0 1
## DSSPβ-bridge:aa_changeD>E NA NA NA NA
## DSSP310 Helix:aa_changeD>E 5.313e+01 4.362e+05 0 1
## DSSPÏ€-helix:aa_changeD>E NA NA NA NA
## DSSPβ-strand:aa_changeD>E 5.313e+01 3.561e+05 0 1
## DSSPTurn:aa_changeD>E 5.313e+01 3.251e+05 0 1
## DSSPα-helix:aa_changeD>G -2.880e-08 5.036e+05 0 1
## DSSPBend:aa_changeD>G -3.123e-08 4.362e+05 0 1
## DSSPβ-bridge:aa_changeD>G NA NA NA NA
## DSSP310 Helix:aa_changeD>G NA NA NA NA
## DSSPÏ€-helix:aa_changeD>G NA NA NA NA
## DSSPβ-strand:aa_changeD>G NA NA NA NA
## DSSPTurn:aa_changeD>G NA NA NA NA
## DSSPα-helix:aa_changeD>H NA NA NA NA
## DSSPBend:aa_changeD>H 5.313e+01 4.362e+05 0 1
## DSSPβ-bridge:aa_changeD>H NA NA NA NA
## DSSP310 Helix:aa_changeD>H NA NA NA NA
## DSSPÏ€-helix:aa_changeD>H 5.313e+01 4.362e+05 0 1
## DSSPβ-strand:aa_changeD>H NA NA NA NA
## DSSPTurn:aa_changeD>H 5.313e+01 4.362e+05 0 1
## DSSPα-helix:aa_changeD>N 5.313e+01 2.980e+05 0 1
## DSSPBend:aa_changeD>N 5.313e+01 2.252e+05 0 1
## DSSPβ-bridge:aa_changeD>N NA NA NA NA
## DSSP310 Helix:aa_changeD>N 5.313e+01 3.901e+05 0 1
## DSSPÏ€-helix:aa_changeD>N 5.313e+01 3.901e+05 0 1
## DSSPβ-strand:aa_changeD>N 5.313e+01 2.601e+05 0 1
## DSSPTurn:aa_changeD>N 5.313e+01 2.980e+05 0 1
## DSSPα-helix:aa_changeD>V 5.313e+01 4.362e+05 0 1
## DSSPBend:aa_changeD>V 5.313e+01 4.362e+05 0 1
## DSSPβ-bridge:aa_changeD>V NA NA NA NA
## DSSP310 Helix:aa_changeD>V NA NA NA NA
## DSSPÏ€-helix:aa_changeD>V NA NA NA NA
## DSSPβ-strand:aa_changeD>V 5.313e+01 5.036e+05 0 1
## DSSPTurn:aa_changeD>V 5.313e+01 4.362e+05 0 1
## DSSPα-helix:aa_changeD>Y 5.313e+01 2.908e+05 0 1
## DSSPBend:aa_changeD>Y 5.313e+01 3.251e+05 0 1
## DSSPβ-bridge:aa_changeD>Y 5.313e+01 4.112e+05 0 1
## DSSP310 Helix:aa_changeD>Y 5.313e+01 4.112e+05 0 1
## DSSPÏ€-helix:aa_changeD>Y 5.313e+01 4.112e+05 0 1
## DSSPβ-strand:aa_changeD>Y 5.313e+01 2.908e+05 0 1
## DSSPTurn:aa_changeD>Y 5.313e+01 4.112e+05 0 1
## DSSPα-helix:aa_changeE>A NA NA NA NA
## DSSPBend:aa_changeE>A NA NA NA NA
## DSSPβ-bridge:aa_changeE>A 5.313e+01 5.036e+05 0 1
## DSSP310 Helix:aa_changeE>A NA NA NA NA
## DSSPÏ€-helix:aa_changeE>A NA NA NA NA
## DSSPβ-strand:aa_changeE>A 5.313e+01 5.036e+05 0 1
## DSSPTurn:aa_changeE>A NA NA NA NA
## DSSPα-helix:aa_changeE>D 5.313e+01 5.036e+05 0 1
## DSSPBend:aa_changeE>D 5.313e+01 5.036e+05 0 1
## DSSPβ-bridge:aa_changeE>D 5.313e+01 5.036e+05 0 1
## DSSP310 Helix:aa_changeE>D 5.313e+01 5.036e+05 0 1
## DSSPÏ€-helix:aa_changeE>D NA NA NA NA
## DSSPβ-strand:aa_changeE>D 5.313e+01 4.112e+05 0 1
## DSSPTurn:aa_changeE>D NA NA NA NA
## DSSPα-helix:aa_changeE>G NA NA NA NA
## DSSPBend:aa_changeE>G 5.313e+01 5.036e+05 0 1
## DSSPβ-bridge:aa_changeE>G 5.313e+01 4.362e+05 0 1
## DSSP310 Helix:aa_changeE>G NA NA NA NA
## DSSPÏ€-helix:aa_changeE>G NA NA NA NA
## DSSPβ-strand:aa_changeE>G 5.313e+01 4.112e+05 0 1
## DSSPTurn:aa_changeE>G 5.313e+01 5.036e+05 0 1
## DSSPα-helix:aa_changeE>K NA NA NA NA
## DSSPBend:aa_changeE>K 5.313e+01 2.908e+05 0 1
## DSSPβ-bridge:aa_changeE>K 5.313e+01 3.251e+05 0 1
## DSSP310 Helix:aa_changeE>K 5.313e+01 4.112e+05 0 1
## DSSPÏ€-helix:aa_changeE>K NA NA NA NA
## DSSPβ-strand:aa_changeE>K 5.313e+01 2.457e+05 0 1
## DSSPTurn:aa_changeE>K 5.313e+01 4.112e+05 0 1
## DSSPα-helix:aa_changeE>Q NA NA NA NA
## DSSPBend:aa_changeE>Q 5.313e+01 5.036e+05 0 1
## DSSPβ-bridge:aa_changeE>Q NA NA NA NA
## DSSP310 Helix:aa_changeE>Q NA NA NA NA
## DSSPÏ€-helix:aa_changeE>Q NA NA NA NA
## DSSPβ-strand:aa_changeE>Q 5.313e+01 4.362e+05 0 1
## DSSPTurn:aa_changeE>Q NA NA NA NA
## DSSPα-helix:aa_changeE>V NA NA NA NA
## DSSPBend:aa_changeE>V 5.313e+01 3.561e+05 0 1
## DSSPβ-bridge:aa_changeE>V 5.313e+01 3.561e+05 0 1
## DSSP310 Helix:aa_changeE>V NA NA NA NA
## DSSPÏ€-helix:aa_changeE>V NA NA NA NA
## DSSPβ-strand:aa_changeE>V 5.313e+01 3.561e+05 0 1
## DSSPTurn:aa_changeE>V 5.313e+01 4.362e+05 0 1
## DSSPα-helix:aa_changeF>C NA NA NA NA
## DSSPBend:aa_changeF>C NA NA NA NA
## DSSPβ-bridge:aa_changeF>C NA NA NA NA
## DSSP310 Helix:aa_changeF>C NA NA NA NA
## DSSPÏ€-helix:aa_changeF>C NA NA NA NA
## DSSPβ-strand:aa_changeF>C NA NA NA NA
## DSSPTurn:aa_changeF>C NA NA NA NA
## DSSPα-helix:aa_changeF>I NA NA NA NA
## DSSPBend:aa_changeF>I -2.035e-07 5.036e+05 0 1
## DSSPβ-bridge:aa_changeF>I NA NA NA NA
## DSSP310 Helix:aa_changeF>I NA NA NA NA
## DSSPÏ€-helix:aa_changeF>I NA NA NA NA
## DSSPβ-strand:aa_changeF>I NA NA NA NA
## DSSPTurn:aa_changeF>I NA NA NA NA
## DSSPα-helix:aa_changeF>L 5.313e+01 4.112e+05 0 1
## DSSPBend:aa_changeF>L 5.313e+01 4.112e+05 0 1
## DSSPβ-bridge:aa_changeF>L NA NA NA NA
## DSSP310 Helix:aa_changeF>L NA NA NA NA
## DSSPÏ€-helix:aa_changeF>L NA NA NA NA
## DSSPβ-strand:aa_changeF>L NA NA NA NA
## DSSPTurn:aa_changeF>L 5.313e+01 4.112e+05 0 1
## DSSPα-helix:aa_changeF>V NA NA NA NA
## DSSPBend:aa_changeF>V NA NA NA NA
## DSSPβ-bridge:aa_changeF>V NA NA NA NA
## DSSP310 Helix:aa_changeF>V NA NA NA NA
## DSSPÏ€-helix:aa_changeF>V NA NA NA NA
## DSSPβ-strand:aa_changeF>V 5.313e+01 5.036e+05 0 1
## DSSPTurn:aa_changeF>V NA NA NA NA
## DSSPα-helix:aa_changeF>Y NA NA NA NA
## DSSPBend:aa_changeF>Y NA NA NA NA
## DSSPβ-bridge:aa_changeF>Y NA NA NA NA
## DSSP310 Helix:aa_changeF>Y NA NA NA NA
## DSSPÏ€-helix:aa_changeF>Y NA NA NA NA
## DSSPβ-strand:aa_changeF>Y NA NA NA NA
## DSSPTurn:aa_changeF>Y NA NA NA NA
## DSSPα-helix:aa_changeG>A NA NA NA NA
## DSSPBend:aa_changeG>A NA NA NA NA
## DSSPβ-bridge:aa_changeG>A NA NA NA NA
## DSSP310 Helix:aa_changeG>A NA NA NA NA
## DSSPÏ€-helix:aa_changeG>A NA NA NA NA
## DSSPβ-strand:aa_changeG>A 5.313e+01 4.112e+05 0 1
## DSSPTurn:aa_changeG>A 5.313e+01 5.036e+05 0 1
## DSSPα-helix:aa_changeG>C NA NA NA NA
## DSSPBend:aa_changeG>C 5.313e+01 4.112e+05 0 1
## DSSPβ-bridge:aa_changeG>C NA NA NA NA
## DSSP310 Helix:aa_changeG>C NA NA NA NA
## DSSPÏ€-helix:aa_changeG>C 5.313e+01 4.112e+05 0 1
## DSSPβ-strand:aa_changeG>C 5.313e+01 2.908e+05 0 1
## DSSPTurn:aa_changeG>C 5.313e+01 2.518e+05 0 1
## DSSPα-helix:aa_changeG>D NA NA NA NA
## DSSPBend:aa_changeG>D 5.313e+01 3.807e+05 0 1
## DSSPβ-bridge:aa_changeG>D NA NA NA NA
## DSSP310 Helix:aa_changeG>D NA NA NA NA
## DSSPÏ€-helix:aa_changeG>D 5.313e+01 3.807e+05 0 1
## DSSPβ-strand:aa_changeG>D 5.313e+01 1.843e+05 0 1
## DSSPTurn:aa_changeG>D 5.313e+01 2.457e+05 0 1
## DSSPα-helix:aa_changeG>E NA NA NA NA
## DSSPBend:aa_changeG>E NA NA NA NA
## DSSPβ-bridge:aa_changeG>E NA NA NA NA
## DSSP310 Helix:aa_changeG>E NA NA NA NA
## DSSPÏ€-helix:aa_changeG>E NA NA NA NA
## DSSPβ-strand:aa_changeG>E 5.313e+01 4.112e+05 0 1
## DSSPTurn:aa_changeG>E 5.313e+01 4.362e+05 0 1
## DSSPα-helix:aa_changeG>R NA NA NA NA
## DSSPBend:aa_changeG>R 5.313e+01 3.807e+05 0 1
## DSSPβ-bridge:aa_changeG>R 5.313e+01 3.807e+05 0 1
## DSSP310 Helix:aa_changeG>R 5.313e+01 3.807e+05 0 1
## DSSPÏ€-helix:aa_changeG>R 5.313e+01 3.807e+05 0 1
## DSSPβ-strand:aa_changeG>R 5.313e+01 1.722e+05 0 1
## DSSPTurn:aa_changeG>R 5.313e+01 2.457e+05 0 1
## DSSPα-helix:aa_changeG>S NA NA NA NA
## DSSPBend:aa_changeG>S 5.313e+01 3.982e+05 0 1
## DSSPβ-bridge:aa_changeG>S NA NA NA NA
## DSSP310 Helix:aa_changeG>S 5.313e+01 3.982e+05 0 1
## DSSPÏ€-helix:aa_changeG>S 5.313e+01 3.982e+05 0 1
## DSSPβ-strand:aa_changeG>S 5.313e+01 2.232e+05 0 1
## DSSPTurn:aa_changeG>S 5.313e+01 2.299e+05 0 1
## DSSPα-helix:aa_changeG>V NA NA NA NA
## DSSPBend:aa_changeG>V 5.313e+01 3.777e+05 0 1
## DSSPβ-bridge:aa_changeG>V NA NA NA NA
## DSSP310 Helix:aa_changeG>V NA NA NA NA
## DSSPÏ€-helix:aa_changeG>V 5.313e+01 3.777e+05 0 1
## DSSPβ-strand:aa_changeG>V 5.313e+01 1.625e+05 0 1
## DSSPTurn:aa_changeG>V 5.313e+01 1.689e+05 0 1
## DSSPα-helix:aa_changeG>W NA NA NA NA
## DSSPBend:aa_changeG>W NA NA NA NA
## DSSPβ-bridge:aa_changeG>W NA NA NA NA
## DSSP310 Helix:aa_changeG>W NA NA NA NA
## DSSPÏ€-helix:aa_changeG>W NA NA NA NA
## DSSPβ-strand:aa_changeG>W 5.313e+01 5.036e+05 0 1
## DSSPTurn:aa_changeG>W 5.313e+01 5.036e+05 0 1
## DSSPα-helix:aa_changeH>D NA NA NA NA
## DSSPBend:aa_changeH>D NA NA NA NA
## DSSPβ-bridge:aa_changeH>D NA NA NA NA
## DSSP310 Helix:aa_changeH>D NA NA NA NA
## DSSPÏ€-helix:aa_changeH>D NA NA NA NA
## DSSPβ-strand:aa_changeH>D 5.313e+01 5.036e+05 0 1
## DSSPTurn:aa_changeH>D 5.313e+01 5.036e+05 0 1
## DSSPα-helix:aa_changeH>L 5.313e+01 4.362e+05 0 1
## DSSPBend:aa_changeH>L NA NA NA NA
## DSSPβ-bridge:aa_changeH>L 5.313e+01 4.362e+05 0 1
## DSSP310 Helix:aa_changeH>L NA NA NA NA
## DSSPÏ€-helix:aa_changeH>L NA NA NA NA
## DSSPβ-strand:aa_changeH>L NA NA NA NA
## DSSPTurn:aa_changeH>L 5.313e+01 3.561e+05 0 1
## DSSPα-helix:aa_changeH>N NA NA NA NA
## DSSPBend:aa_changeH>N NA NA NA NA
## DSSPβ-bridge:aa_changeH>N NA NA NA NA
## DSSP310 Helix:aa_changeH>N NA NA NA NA
## DSSPÏ€-helix:aa_changeH>N NA NA NA NA
## DSSPβ-strand:aa_changeH>N 5.313e+01 5.036e+05 0 1
## DSSPTurn:aa_changeH>N 5.313e+01 5.036e+05 0 1
## DSSPα-helix:aa_changeH>P 5.313e+01 4.362e+05 0 1
## DSSPBend:aa_changeH>P NA NA NA NA
## DSSPβ-bridge:aa_changeH>P NA NA NA NA
## DSSP310 Helix:aa_changeH>P NA NA NA NA
## DSSPÏ€-helix:aa_changeH>P NA NA NA NA
## DSSPβ-strand:aa_changeH>P 5.313e+01 3.561e+05 0 1
## DSSPTurn:aa_changeH>P 5.313e+01 3.561e+05 0 1
## DSSPα-helix:aa_changeH>Q NA NA NA NA
## DSSPBend:aa_changeH>Q NA NA NA NA
## DSSPβ-bridge:aa_changeH>Q NA NA NA NA
## DSSP310 Helix:aa_changeH>Q 5.313e+01 4.112e+05 0 1
## DSSPÏ€-helix:aa_changeH>Q NA NA NA NA
## DSSPβ-strand:aa_changeH>Q 5.313e+01 3.251e+05 0 1
## DSSPTurn:aa_changeH>Q 5.313e+01 3.251e+05 0 1
## DSSPα-helix:aa_changeH>R 5.313e+01 3.982e+05 0 1
## DSSPBend:aa_changeH>R NA NA NA NA
## DSSPβ-bridge:aa_changeH>R NA NA NA NA
## DSSP310 Helix:aa_changeH>R NA NA NA NA
## DSSPÏ€-helix:aa_changeH>R NA NA NA NA
## DSSPβ-strand:aa_changeH>R NA NA NA NA
## DSSPTurn:aa_changeH>R 5.313e+01 3.084e+05 0 1
## DSSPα-helix:aa_changeH>Y NA NA NA NA
## DSSPBend:aa_changeH>Y NA NA NA NA
## DSSPβ-bridge:aa_changeH>Y NA NA NA NA
## DSSP310 Helix:aa_changeH>Y NA NA NA NA
## DSSPÏ€-helix:aa_changeH>Y NA NA NA NA
## DSSPβ-strand:aa_changeH>Y NA NA NA NA
## DSSPTurn:aa_changeH>Y 5.313e+01 3.251e+05 0 1
## DSSPα-helix:aa_changeI>F 5.313e+01 5.036e+05 0 1
## DSSPBend:aa_changeI>F 5.313e+01 5.036e+05 0 1
## DSSPβ-bridge:aa_changeI>F NA NA NA NA
## DSSP310 Helix:aa_changeI>F NA NA NA NA
## DSSPÏ€-helix:aa_changeI>F NA NA NA NA
## DSSPβ-strand:aa_changeI>F 5.313e+01 3.982e+05 0 1
## DSSPTurn:aa_changeI>F NA NA NA NA
## DSSPα-helix:aa_changeI>M NA NA NA NA
## DSSPBend:aa_changeI>M NA NA NA NA
## DSSPβ-bridge:aa_changeI>M NA NA NA NA
## DSSP310 Helix:aa_changeI>M NA NA NA NA
## DSSPÏ€-helix:aa_changeI>M NA NA NA NA
## DSSPβ-strand:aa_changeI>M NA NA NA NA
## DSSPTurn:aa_changeI>M NA NA NA NA
## DSSPα-helix:aa_changeI>N NA NA NA NA
## DSSPBend:aa_changeI>N -1.919e-07 3.847e+05 0 1
## DSSPβ-bridge:aa_changeI>N 5.002e-09 3.847e+05 0 1
## DSSP310 Helix:aa_changeI>N NA NA NA NA
## DSSPÏ€-helix:aa_changeI>N NA NA NA NA
## DSSPβ-strand:aa_changeI>N NA NA NA NA
## DSSPTurn:aa_changeI>N NA NA NA NA
## DSSPα-helix:aa_changeI>T NA NA NA NA
## DSSPBend:aa_changeI>T NA NA NA NA
## DSSPβ-bridge:aa_changeI>T NA NA NA NA
## DSSP310 Helix:aa_changeI>T NA NA NA NA
## DSSPÏ€-helix:aa_changeI>T NA NA NA NA
## DSSPβ-strand:aa_changeI>T NA NA NA NA
## DSSPTurn:aa_changeI>T NA NA NA NA
## DSSPα-helix:aa_changeI>V NA NA NA NA
## DSSPBend:aa_changeI>V 5.313e+01 4.362e+05 0 1
## DSSPβ-bridge:aa_changeI>V NA NA NA NA
## DSSP310 Helix:aa_changeI>V NA NA NA NA
## DSSPÏ€-helix:aa_changeI>V NA NA NA NA
## DSSPβ-strand:aa_changeI>V 5.313e+01 4.362e+05 0 1
## DSSPTurn:aa_changeI>V NA NA NA NA
## DSSPα-helix:aa_changeK>E 5.313e+01 5.036e+05 0 1
## DSSPBend:aa_changeK>E NA NA NA NA
## DSSPβ-bridge:aa_changeK>E NA NA NA NA
## DSSP310 Helix:aa_changeK>E NA NA NA NA
## DSSPÏ€-helix:aa_changeK>E NA NA NA NA
## DSSPβ-strand:aa_changeK>E 5.313e+01 4.112e+05 0 1
## DSSPTurn:aa_changeK>E NA NA NA NA
## DSSPα-helix:aa_changeK>N -4.078e-08 4.362e+05 0 1
## DSSPBend:aa_changeK>N NA NA NA NA
## DSSPβ-bridge:aa_changeK>N NA NA NA NA
## DSSP310 Helix:aa_changeK>N NA NA NA NA
## DSSPÏ€-helix:aa_changeK>N NA NA NA NA
## DSSPβ-strand:aa_changeK>N NA NA NA NA
## DSSPTurn:aa_changeK>N NA NA NA NA
## DSSPα-helix:aa_changeK>Q -1.946e-07 5.036e+05 0 1
## DSSPBend:aa_changeK>Q NA NA NA NA
## DSSPβ-bridge:aa_changeK>Q NA NA NA NA
## DSSP310 Helix:aa_changeK>Q NA NA NA NA
## DSSPÏ€-helix:aa_changeK>Q NA NA NA NA
## DSSPβ-strand:aa_changeK>Q NA NA NA NA
## DSSPTurn:aa_changeK>Q NA NA NA NA
## DSSPα-helix:aa_changeL>F NA NA NA NA
## DSSPBend:aa_changeL>F NA NA NA NA
## DSSPβ-bridge:aa_changeL>F NA NA NA NA
## DSSP310 Helix:aa_changeL>F NA NA NA NA
## DSSPÏ€-helix:aa_changeL>F NA NA NA NA
## DSSPβ-strand:aa_changeL>F 5.313e+01 5.036e+05 0 1
## DSSPTurn:aa_changeL>F NA NA NA NA
## DSSPα-helix:aa_changeL>H NA NA NA NA
## DSSPBend:aa_changeL>H NA NA NA NA
## DSSPβ-bridge:aa_changeL>H NA NA NA NA
## DSSP310 Helix:aa_changeL>H NA NA NA NA
## DSSPÏ€-helix:aa_changeL>H NA NA NA NA
## DSSPβ-strand:aa_changeL>H 5.313e+01 5.036e+05 0 1
## DSSPTurn:aa_changeL>H NA NA NA NA
## DSSPα-helix:aa_changeL>I 5.313e+01 5.036e+05 0 1
## DSSPBend:aa_changeL>I NA NA NA NA
## DSSPβ-bridge:aa_changeL>I NA NA NA NA
## DSSP310 Helix:aa_changeL>I NA NA NA NA
## DSSPÏ€-helix:aa_changeL>I NA NA NA NA
## DSSPβ-strand:aa_changeL>I 5.313e+01 5.036e+05 0 1
## DSSPTurn:aa_changeL>I NA NA NA NA
## DSSPα-helix:aa_changeL>M 5.313e+01 4.112e+05 0 1
## DSSPBend:aa_changeL>M NA NA NA NA
## DSSPβ-bridge:aa_changeL>M NA NA NA NA
## DSSP310 Helix:aa_changeL>M NA NA NA NA
## DSSPÏ€-helix:aa_changeL>M NA NA NA NA
## DSSPβ-strand:aa_changeL>M 5.313e+01 3.251e+05 0 1
## DSSPTurn:aa_changeL>M NA NA NA NA
## DSSPα-helix:aa_changeL>P 5.313e+01 3.251e+05 0 1
## DSSPBend:aa_changeL>P 5.313e+01 4.112e+05 0 1
## DSSPβ-bridge:aa_changeL>P NA NA NA NA
## DSSP310 Helix:aa_changeL>P NA NA NA NA
## DSSPÏ€-helix:aa_changeL>P NA NA NA NA
## DSSPβ-strand:aa_changeL>P 5.313e+01 2.344e+05 0 1
## DSSPTurn:aa_changeL>P 5.313e+01 4.112e+05 0 1
## DSSPα-helix:aa_changeL>Q 5.313e+01 4.362e+05 0 1
## DSSPBend:aa_changeL>Q NA NA NA NA
## DSSPβ-bridge:aa_changeL>Q NA NA NA NA
## DSSP310 Helix:aa_changeL>Q NA NA NA NA
## DSSPÏ€-helix:aa_changeL>Q NA NA NA NA
## DSSPβ-strand:aa_changeL>Q 5.313e+01 3.251e+05 0 1
## DSSPTurn:aa_changeL>Q NA NA NA NA
## DSSPα-helix:aa_changeL>R 5.313e+01 4.362e+05 0 1
## DSSPBend:aa_changeL>R 5.313e+01 4.362e+05 0 1
## DSSPβ-bridge:aa_changeL>R NA NA NA NA
## DSSP310 Helix:aa_changeL>R 5.313e+01 4.362e+05 0 1
## DSSPÏ€-helix:aa_changeL>R NA NA NA NA
## DSSPβ-strand:aa_changeL>R 5.313e+01 2.980e+05 0 1
## DSSPTurn:aa_changeL>R NA NA NA NA
## DSSPα-helix:aa_changeL>V NA NA NA NA
## DSSPBend:aa_changeL>V NA NA NA NA
## DSSPβ-bridge:aa_changeL>V NA NA NA NA
## DSSP310 Helix:aa_changeL>V NA NA NA NA
## DSSPÏ€-helix:aa_changeL>V NA NA NA NA
## DSSPβ-strand:aa_changeL>V 5.313e+01 4.362e+05 0 1
## DSSPTurn:aa_changeL>V NA NA NA NA
## DSSPα-helix:aa_changeM>I 5.313e+01 3.561e+05 0 1
## DSSPBend:aa_changeM>I NA NA NA NA
## DSSPβ-bridge:aa_changeM>I NA NA NA NA
## DSSP310 Helix:aa_changeM>I NA NA NA NA
## DSSPÏ€-helix:aa_changeM>I NA NA NA NA
## DSSPβ-strand:aa_changeM>I NA NA NA NA
## DSSPTurn:aa_changeM>I 5.313e+01 4.362e+05 0 1
## DSSPα-helix:aa_changeM>K 5.313e+01 5.036e+05 0 1
## DSSPBend:aa_changeM>K 5.313e+01 5.036e+05 0 1
## DSSPβ-bridge:aa_changeM>K 5.313e+01 5.036e+05 0 1
## DSSP310 Helix:aa_changeM>K NA NA NA NA
## DSSPÏ€-helix:aa_changeM>K NA NA NA NA
## DSSPβ-strand:aa_changeM>K NA NA NA NA
## DSSPTurn:aa_changeM>K NA NA NA NA
## DSSPα-helix:aa_changeM>L NA NA NA NA
## DSSPBend:aa_changeM>L 5.313e+01 4.362e+05 0 1
## DSSPβ-bridge:aa_changeM>L NA NA NA NA
## DSSP310 Helix:aa_changeM>L NA NA NA NA
## DSSPÏ€-helix:aa_changeM>L NA NA NA NA
## DSSPβ-strand:aa_changeM>L NA NA NA NA
## DSSPTurn:aa_changeM>L NA NA NA NA
## DSSPα-helix:aa_changeM>R 5.313e+01 3.561e+05 0 1
## DSSPBend:aa_changeM>R NA NA NA NA
## DSSPβ-bridge:aa_changeM>R 5.313e+01 4.362e+05 0 1
## DSSP310 Helix:aa_changeM>R NA NA NA NA
## DSSPÏ€-helix:aa_changeM>R NA NA NA NA
## DSSPβ-strand:aa_changeM>R NA NA NA NA
## DSSPTurn:aa_changeM>R NA NA NA NA
## DSSPα-helix:aa_changeM>T NA NA NA NA
## DSSPBend:aa_changeM>T NA NA NA NA
## DSSPβ-bridge:aa_changeM>T NA NA NA NA
## DSSP310 Helix:aa_changeM>T NA NA NA NA
## DSSPÏ€-helix:aa_changeM>T NA NA NA NA
## DSSPβ-strand:aa_changeM>T NA NA NA NA
## DSSPTurn:aa_changeM>T NA NA NA NA
## DSSPα-helix:aa_changeM>V NA NA NA NA
## DSSPBend:aa_changeM>V NA NA NA NA
## DSSPβ-bridge:aa_changeM>V 4.616e-06 5.036e+05 0 1
## DSSP310 Helix:aa_changeM>V NA NA NA NA
## DSSPÏ€-helix:aa_changeM>V NA NA NA NA
## DSSPβ-strand:aa_changeM>V NA NA NA NA
## DSSPTurn:aa_changeM>V NA NA NA NA
## DSSPα-helix:aa_changeN>D 5.391e-09 5.036e+05 0 1
## DSSPBend:aa_changeN>D 1.929e-07 5.036e+05 0 1
## DSSPβ-bridge:aa_changeN>D NA NA NA NA
## DSSP310 Helix:aa_changeN>D NA NA NA NA
## DSSPÏ€-helix:aa_changeN>D NA NA NA NA
## DSSPβ-strand:aa_changeN>D -1.518e-09 5.036e+05 0 1
## DSSPTurn:aa_changeN>D NA NA NA NA
## DSSPα-helix:aa_changeN>H NA NA NA NA
## DSSPBend:aa_changeN>H NA NA NA NA
## DSSPβ-bridge:aa_changeN>H NA NA NA NA
## DSSP310 Helix:aa_changeN>H NA NA NA NA
## DSSPÏ€-helix:aa_changeN>H NA NA NA NA
## DSSPβ-strand:aa_changeN>H 5.313e+01 5.036e+05 0 1
## DSSPTurn:aa_changeN>H NA NA NA NA
## DSSPα-helix:aa_changeN>I 5.313e+01 4.112e+05 0 1
## DSSPBend:aa_changeN>I 5.313e+01 2.908e+05 0 1
## DSSPβ-bridge:aa_changeN>I NA NA NA NA
## DSSP310 Helix:aa_changeN>I NA NA NA NA
## DSSPÏ€-helix:aa_changeN>I NA NA NA NA
## DSSPβ-strand:aa_changeN>I 5.313e+01 3.251e+05 0 1
## DSSPTurn:aa_changeN>I 5.313e+01 4.112e+05 0 1
## DSSPα-helix:aa_changeN>K 5.313e+01 3.982e+05 0 1
## DSSPBend:aa_changeN>K 5.313e+01 2.389e+05 0 1
## DSSPβ-bridge:aa_changeN>K NA NA NA NA
## DSSP310 Helix:aa_changeN>K NA NA NA NA
## DSSPÏ€-helix:aa_changeN>K NA NA NA NA
## DSSPβ-strand:aa_changeN>K 5.313e+01 3.084e+05 0 1
## DSSPTurn:aa_changeN>K 5.313e+01 3.084e+05 0 1
## DSSPα-helix:aa_changeN>S 1.811e-07 4.362e+05 0 1
## DSSPBend:aa_changeN>S NA NA NA NA
## DSSPβ-bridge:aa_changeN>S NA NA NA NA
## DSSP310 Helix:aa_changeN>S NA NA NA NA
## DSSPÏ€-helix:aa_changeN>S NA NA NA NA
## DSSPβ-strand:aa_changeN>S NA NA NA NA
## DSSPTurn:aa_changeN>S NA NA NA NA
## DSSPα-helix:aa_changeN>T NA NA NA NA
## DSSPBend:aa_changeN>T 5.313e+01 4.362e+05 0 1
## DSSPβ-bridge:aa_changeN>T NA NA NA NA
## DSSP310 Helix:aa_changeN>T NA NA NA NA
## DSSPÏ€-helix:aa_changeN>T NA NA NA NA
## DSSPβ-strand:aa_changeN>T NA NA NA NA
## DSSPTurn:aa_changeN>T NA NA NA NA
## DSSPα-helix:aa_changeN>Y -1.799e-07 5.036e+05 0 1
## DSSPBend:aa_changeN>Y -1.895e-07 3.982e+05 0 1
## DSSPβ-bridge:aa_changeN>Y NA NA NA NA
## DSSP310 Helix:aa_changeN>Y NA NA NA NA
## DSSPÏ€-helix:aa_changeN>Y NA NA NA NA
## DSSPβ-strand:aa_changeN>Y NA NA NA NA
## DSSPTurn:aa_changeN>Y NA NA NA NA
## DSSPα-helix:aa_changeP>A NA NA NA NA
## DSSPBend:aa_changeP>A 5.313e+01 4.362e+05 0 1
## DSSPβ-bridge:aa_changeP>A NA NA NA NA
## DSSP310 Helix:aa_changeP>A 5.313e+01 4.362e+05 0 1
## DSSPÏ€-helix:aa_changeP>A NA NA NA NA
## DSSPβ-strand:aa_changeP>A NA NA NA NA
## DSSPTurn:aa_changeP>A NA NA NA NA
## DSSPα-helix:aa_changeP>H NA NA NA NA
## DSSPBend:aa_changeP>H NA NA NA NA
## DSSPβ-bridge:aa_changeP>H NA NA NA NA
## DSSP310 Helix:aa_changeP>H NA NA NA NA
## DSSPÏ€-helix:aa_changeP>H NA NA NA NA
## DSSPβ-strand:aa_changeP>H NA NA NA NA
## DSSPTurn:aa_changeP>H 5.313e+01 4.362e+05 0 1
## DSSPα-helix:aa_changeP>L NA NA NA NA
## DSSPBend:aa_changeP>L 5.313e+01 2.299e+05 0 1
## DSSPβ-bridge:aa_changeP>L NA NA NA NA
## DSSP310 Helix:aa_changeP>L NA NA NA NA
## DSSPÏ€-helix:aa_changeP>L NA NA NA NA
## DSSPβ-strand:aa_changeP>L 5.313e+01 3.847e+05 0 1
## DSSPTurn:aa_changeP>L 5.313e+01 2.908e+05 0 1
## DSSPα-helix:aa_changeP>Q NA NA NA NA
## DSSPBend:aa_changeP>Q 5.313e+01 3.735e+05 0 1
## DSSPβ-bridge:aa_changeP>Q NA NA NA NA
## DSSP310 Helix:aa_changeP>Q NA NA NA NA
## DSSPÏ€-helix:aa_changeP>Q NA NA NA NA
## DSSPβ-strand:aa_changeP>Q 5.313e+01 3.735e+05 0 1
## DSSPTurn:aa_changeP>Q 5.313e+01 3.735e+05 0 1
## DSSPα-helix:aa_changeP>R NA NA NA NA
## DSSPBend:aa_changeP>R 5.313e+01 3.901e+05 0 1
## DSSPβ-bridge:aa_changeP>R NA NA NA NA
## DSSP310 Helix:aa_changeP>R NA NA NA NA
## DSSPÏ€-helix:aa_changeP>R NA NA NA NA
## DSSPβ-strand:aa_changeP>R NA NA NA NA
## DSSPTurn:aa_changeP>R 5.313e+01 3.901e+05 0 1
## DSSPα-helix:aa_changeP>S 5.313e+01 4.112e+05 0 1
## DSSPBend:aa_changeP>S 5.313e+01 4.112e+05 0 1
## DSSPβ-bridge:aa_changeP>S NA NA NA NA
## DSSP310 Helix:aa_changeP>S NA NA NA NA
## DSSPÏ€-helix:aa_changeP>S NA NA NA NA
## DSSPβ-strand:aa_changeP>S NA NA NA NA
## DSSPTurn:aa_changeP>S 5.313e+01 3.251e+05 0 1
## DSSPα-helix:aa_changeP>T NA NA NA NA
## DSSPBend:aa_changeP>T 5.313e+01 2.601e+05 0 1
## DSSPβ-bridge:aa_changeP>T NA NA NA NA
## DSSP310 Helix:aa_changeP>T NA NA NA NA
## DSSPÏ€-helix:aa_changeP>T NA NA NA NA
## DSSPβ-strand:aa_changeP>T 5.313e+01 3.901e+05 0 1
## DSSPTurn:aa_changeP>T 5.313e+01 2.980e+05 0 1
## DSSPα-helix:aa_changeQ>E -4.443e-06 4.362e+05 0 1
## DSSPBend:aa_changeQ>E NA NA NA NA
## DSSPβ-bridge:aa_changeQ>E NA NA NA NA
## DSSP310 Helix:aa_changeQ>E NA NA NA NA
## DSSPÏ€-helix:aa_changeQ>E NA NA NA NA
## DSSPβ-strand:aa_changeQ>E -3.677e-08 4.362e+05 0 1
## DSSPTurn:aa_changeQ>E NA NA NA NA
## DSSPα-helix:aa_changeQ>H 5.313e+01 3.251e+05 0 1
## DSSPBend:aa_changeQ>H NA NA NA NA
## DSSPβ-bridge:aa_changeQ>H NA NA NA NA
## DSSP310 Helix:aa_changeQ>H NA NA NA NA
## DSSPÏ€-helix:aa_changeQ>H NA NA NA NA
## DSSPβ-strand:aa_changeQ>H 5.313e+01 2.908e+05 0 1
## DSSPTurn:aa_changeQ>H 5.313e+01 4.362e+05 0 1
## DSSPα-helix:aa_changeQ>K 5.313e+01 2.980e+05 0 1
## DSSPBend:aa_changeQ>K NA NA NA NA
## DSSPβ-bridge:aa_changeQ>K NA NA NA NA
## DSSP310 Helix:aa_changeQ>K NA NA NA NA
## DSSPÏ€-helix:aa_changeQ>K NA NA NA NA
## DSSPβ-strand:aa_changeQ>K 5.313e+01 2.980e+05 0 1
## DSSPTurn:aa_changeQ>K 5.313e+01 3.901e+05 0 1
## DSSPα-helix:aa_changeQ>L NA NA NA NA
## DSSPBend:aa_changeQ>L NA NA NA NA
## DSSPβ-bridge:aa_changeQ>L NA NA NA NA
## DSSP310 Helix:aa_changeQ>L NA NA NA NA
## DSSPÏ€-helix:aa_changeQ>L NA NA NA NA
## DSSPβ-strand:aa_changeQ>L NA NA NA NA
## DSSPTurn:aa_changeQ>L NA NA NA NA
## DSSPα-helix:aa_changeQ>P NA NA NA NA
## DSSPBend:aa_changeQ>P NA NA NA NA
## DSSPβ-bridge:aa_changeQ>P NA NA NA NA
## DSSP310 Helix:aa_changeQ>P NA NA NA NA
## DSSPÏ€-helix:aa_changeQ>P NA NA NA NA
## DSSPβ-strand:aa_changeQ>P 5.313e+01 4.362e+05 0 1
## DSSPTurn:aa_changeQ>P 5.313e+01 4.362e+05 0 1
## DSSPα-helix:aa_changeQ>R 5.313e+01 5.036e+05 0 1
## DSSPBend:aa_changeQ>R NA NA NA NA
## DSSPβ-bridge:aa_changeQ>R NA NA NA NA
## DSSP310 Helix:aa_changeQ>R 5.313e+01 5.036e+05 0 1
## DSSPÏ€-helix:aa_changeQ>R NA NA NA NA
## DSSPβ-strand:aa_changeQ>R 5.313e+01 5.036e+05 0 1
## DSSPTurn:aa_changeQ>R 5.313e+01 5.036e+05 0 1
## DSSPα-helix:aa_changeR>C 5.313e+01 2.908e+05 0 1
## DSSPBend:aa_changeR>C 5.313e+01 3.251e+05 0 1
## DSSPβ-bridge:aa_changeR>C 5.313e+01 4.112e+05 0 1
## DSSP310 Helix:aa_changeR>C NA NA NA NA
## DSSPÏ€-helix:aa_changeR>C NA NA NA NA
## DSSPβ-strand:aa_changeR>C 5.313e+01 2.601e+05 0 1
## DSSPTurn:aa_changeR>C 5.313e+01 4.112e+05 0 1
## DSSPα-helix:aa_changeR>G NA NA NA NA
## DSSPBend:aa_changeR>G NA NA NA NA
## DSSPβ-bridge:aa_changeR>G 5.313e+01 4.362e+05 0 1
## DSSP310 Helix:aa_changeR>G NA NA NA NA
## DSSPÏ€-helix:aa_changeR>G NA NA NA NA
## DSSPβ-strand:aa_changeR>G 5.313e+01 2.980e+05 0 1
## DSSPTurn:aa_changeR>G NA NA NA NA
## DSSPα-helix:aa_changeR>H 5.313e+01 3.251e+05 0 1
## DSSPBend:aa_changeR>H 5.313e+01 3.561e+05 0 1
## DSSPβ-bridge:aa_changeR>H 5.313e+01 4.362e+05 0 1
## DSSP310 Helix:aa_changeR>H NA NA NA NA
## DSSPÏ€-helix:aa_changeR>H NA NA NA NA
## DSSPβ-strand:aa_changeR>H 5.313e+01 3.561e+05 0 1
## DSSPTurn:aa_changeR>H 5.313e+01 3.561e+05 0 1
## DSSPα-helix:aa_changeR>L 5.313e+01 3.084e+05 0 1
## DSSPBend:aa_changeR>L 5.313e+01 3.084e+05 0 1
## DSSPβ-bridge:aa_changeR>L 5.313e+01 3.982e+05 0 1
## DSSP310 Helix:aa_changeR>L 5.313e+01 3.982e+05 0 1
## DSSPÏ€-helix:aa_changeR>L NA NA NA NA
## DSSPβ-strand:aa_changeR>L 5.313e+01 2.232e+05 0 1
## DSSPTurn:aa_changeR>L NA NA NA NA
## DSSPα-helix:aa_changeR>P 5.313e+01 3.251e+05 0 1
## DSSPBend:aa_changeR>P 5.313e+01 2.908e+05 0 1
## DSSPβ-bridge:aa_changeR>P 5.313e+01 4.112e+05 0 1
## DSSP310 Helix:aa_changeR>P 5.313e+01 4.112e+05 0 1
## DSSPÏ€-helix:aa_changeR>P 5.313e+01 4.112e+05 0 1
## DSSPβ-strand:aa_changeR>P 5.313e+01 2.518e+05 0 1
## DSSPTurn:aa_changeR>P 5.313e+01 4.112e+05 0 1
## DSSPα-helix:aa_changeR>Q NA NA NA NA
## DSSPBend:aa_changeR>Q NA NA NA NA
## DSSPβ-bridge:aa_changeR>Q NA NA NA NA
## DSSP310 Helix:aa_changeR>Q NA NA NA NA
## DSSPÏ€-helix:aa_changeR>Q NA NA NA NA
## DSSPβ-strand:aa_changeR>Q NA NA NA NA
## DSSPTurn:aa_changeR>Q NA NA NA NA
## DSSPα-helix:aa_changeR>S 5.313e+01 3.901e+05 0 1
## DSSPBend:aa_changeR>S 5.313e+01 3.901e+05 0 1
## DSSPβ-bridge:aa_changeR>S 5.313e+01 2.980e+05 0 1
## DSSP310 Helix:aa_changeR>S NA NA NA NA
## DSSPÏ€-helix:aa_changeR>S 5.313e+01 3.901e+05 0 1
## DSSPβ-strand:aa_changeR>S 5.313e+01 2.980e+05 0 1
## DSSPTurn:aa_changeR>S NA NA NA NA
## DSSPα-helix:aa_changeR>W NA NA NA NA
## DSSPBend:aa_changeR>W NA NA NA NA
## DSSPβ-bridge:aa_changeR>W NA NA NA NA
## DSSP310 Helix:aa_changeR>W NA NA NA NA
## DSSPÏ€-helix:aa_changeR>W NA NA NA NA
## DSSPβ-strand:aa_changeR>W NA NA NA NA
## DSSPTurn:aa_changeR>W NA NA NA NA
## DSSPα-helix:aa_changeS>C NA NA NA NA
## DSSPBend:aa_changeS>C 5.313e+01 3.561e+05 0 1
## DSSPβ-bridge:aa_changeS>C NA NA NA NA
## DSSP310 Helix:aa_changeS>C NA NA NA NA
## DSSPÏ€-helix:aa_changeS>C NA NA NA NA
## DSSPβ-strand:aa_changeS>C 5.313e+01 4.362e+05 0 1
## DSSPTurn:aa_changeS>C NA NA NA NA
## DSSPα-helix:aa_changeS>F NA NA NA NA
## DSSPBend:aa_changeS>F NA NA NA NA
## DSSPβ-bridge:aa_changeS>F NA NA NA NA
## DSSP310 Helix:aa_changeS>F NA NA NA NA
## DSSPÏ€-helix:aa_changeS>F NA NA NA NA
## DSSPβ-strand:aa_changeS>F NA NA NA NA
## DSSPTurn:aa_changeS>F NA NA NA NA
## DSSPα-helix:aa_changeS>G NA NA NA NA
## DSSPBend:aa_changeS>G NA NA NA NA
## DSSPβ-bridge:aa_changeS>G NA NA NA NA
## DSSP310 Helix:aa_changeS>G NA NA NA NA
## DSSPÏ€-helix:aa_changeS>G NA NA NA NA
## DSSPβ-strand:aa_changeS>G NA NA NA NA
## DSSPTurn:aa_changeS>G 5.313e+01 4.362e+05 0 1
## DSSPα-helix:aa_changeS>I NA NA NA NA
## DSSPBend:aa_changeS>I NA NA NA NA
## DSSPβ-bridge:aa_changeS>I NA NA NA NA
## DSSP310 Helix:aa_changeS>I NA NA NA NA
## DSSPÏ€-helix:aa_changeS>I NA NA NA NA
## DSSPβ-strand:aa_changeS>I NA NA NA NA
## DSSPTurn:aa_changeS>I 5.313e+01 4.362e+05 0 1
## DSSPα-helix:aa_changeS>L NA NA NA NA
## DSSPBend:aa_changeS>L 5.313e+01 5.036e+05 0 1
## DSSPβ-bridge:aa_changeS>L NA NA NA NA
## DSSP310 Helix:aa_changeS>L NA NA NA NA
## DSSPÏ€-helix:aa_changeS>L NA NA NA NA
## DSSPβ-strand:aa_changeS>L 5.313e+01 4.112e+05 0 1
## DSSPTurn:aa_changeS>L 5.313e+01 5.036e+05 0 1
## DSSPα-helix:aa_changeS>N NA NA NA NA
## DSSPBend:aa_changeS>N NA NA NA NA
## DSSPβ-bridge:aa_changeS>N NA NA NA NA
## DSSP310 Helix:aa_changeS>N NA NA NA NA
## DSSPÏ€-helix:aa_changeS>N NA NA NA NA
## DSSPβ-strand:aa_changeS>N 5.313e+01 4.362e+05 0 1
## DSSPTurn:aa_changeS>N 5.313e+01 4.362e+05 0 1
## DSSPα-helix:aa_changeS>P NA NA NA NA
## DSSPBend:aa_changeS>P NA NA NA NA
## DSSPβ-bridge:aa_changeS>P 5.313e+01 3.982e+05 0 1
## DSSP310 Helix:aa_changeS>P NA NA NA NA
## DSSPÏ€-helix:aa_changeS>P NA NA NA NA
## DSSPβ-strand:aa_changeS>P 5.313e+01 2.720e+05 0 1
## DSSPTurn:aa_changeS>P NA NA NA NA
## DSSPα-helix:aa_changeS>R NA NA NA NA
## DSSPBend:aa_changeS>R NA NA NA NA
## DSSPβ-bridge:aa_changeS>R NA NA NA NA
## DSSP310 Helix:aa_changeS>R NA NA NA NA
## DSSPÏ€-helix:aa_changeS>R NA NA NA NA
## DSSPβ-strand:aa_changeS>R 5.313e+01 2.518e+05 0 1
## DSSPTurn:aa_changeS>R 5.313e+01 2.908e+05 0 1
## DSSPα-helix:aa_changeS>T NA NA NA NA
## DSSPBend:aa_changeS>T NA NA NA NA
## DSSPβ-bridge:aa_changeS>T NA NA NA NA
## DSSP310 Helix:aa_changeS>T NA NA NA NA
## DSSPÏ€-helix:aa_changeS>T NA NA NA NA
## DSSPβ-strand:aa_changeS>T NA NA NA NA
## DSSPTurn:aa_changeS>T NA NA NA NA
## DSSPα-helix:aa_changeS>W NA NA NA NA
## DSSPBend:aa_changeS>W -4.617e-06 5.036e+05 0 1
## DSSPβ-bridge:aa_changeS>W NA NA NA NA
## DSSP310 Helix:aa_changeS>W NA NA NA NA
## DSSPÏ€-helix:aa_changeS>W NA NA NA NA
## DSSPβ-strand:aa_changeS>W 2.336e-08 5.036e+05 0 1
## DSSPTurn:aa_changeS>W NA NA NA NA
## DSSPα-helix:aa_changeS>Y NA NA NA NA
## DSSPBend:aa_changeS>Y NA NA NA NA
## DSSPβ-bridge:aa_changeS>Y NA NA NA NA
## DSSP310 Helix:aa_changeS>Y NA NA NA NA
## DSSPÏ€-helix:aa_changeS>Y NA NA NA NA
## DSSPβ-strand:aa_changeS>Y NA NA NA NA
## DSSPTurn:aa_changeS>Y NA NA NA NA
## DSSPα-helix:aa_changeT>A 5.313e+01 5.036e+05 0 1
## DSSPBend:aa_changeT>A NA NA NA NA
## DSSPβ-bridge:aa_changeT>A NA NA NA NA
## DSSP310 Helix:aa_changeT>A NA NA NA NA
## DSSPÏ€-helix:aa_changeT>A NA NA NA NA
## DSSPβ-strand:aa_changeT>A 5.313e+01 4.362e+05 0 1
## DSSPTurn:aa_changeT>A NA NA NA NA
## DSSPα-helix:aa_changeT>I NA NA NA NA
## DSSPBend:aa_changeT>I 4.447e-06 5.036e+05 0 1
## DSSPβ-bridge:aa_changeT>I NA NA NA NA
## DSSP310 Helix:aa_changeT>I NA NA NA NA
## DSSPÏ€-helix:aa_changeT>I NA NA NA NA
## DSSPβ-strand:aa_changeT>I NA NA NA NA
## DSSPTurn:aa_changeT>I NA NA NA NA
## DSSPα-helix:aa_changeT>K 2.294e-07 4.112e+05 0 1
## DSSPBend:aa_changeT>K NA NA NA NA
## DSSPβ-bridge:aa_changeT>K NA NA NA NA
## DSSP310 Helix:aa_changeT>K NA NA NA NA
## DSSPÏ€-helix:aa_changeT>K NA NA NA NA
## DSSPβ-strand:aa_changeT>K NA NA NA NA
## DSSPTurn:aa_changeT>K NA NA NA NA
## DSSPα-helix:aa_changeT>M -4.420e-06 5.036e+05 0 1
## DSSPBend:aa_changeT>M -4.856e-10 5.036e+05 0 1
## DSSPβ-bridge:aa_changeT>M NA NA NA NA
## DSSP310 Helix:aa_changeT>M NA NA NA NA
## DSSPÏ€-helix:aa_changeT>M NA NA NA NA
## DSSPβ-strand:aa_changeT>M NA NA NA NA
## DSSPTurn:aa_changeT>M NA NA NA NA
## DSSPα-helix:aa_changeT>N NA NA NA NA
## DSSPBend:aa_changeT>N NA NA NA NA
## DSSPβ-bridge:aa_changeT>N NA NA NA NA
## DSSP310 Helix:aa_changeT>N NA NA NA NA
## DSSPÏ€-helix:aa_changeT>N NA NA NA NA
## DSSPβ-strand:aa_changeT>N NA NA NA NA
## DSSPTurn:aa_changeT>N NA NA NA NA
## DSSPα-helix:aa_changeT>P NA NA NA NA
## DSSPBend:aa_changeT>P NA NA NA NA
## DSSPβ-bridge:aa_changeT>P NA NA NA NA
## DSSP310 Helix:aa_changeT>P NA NA NA NA
## DSSPÏ€-helix:aa_changeT>P NA NA NA NA
## DSSPβ-strand:aa_changeT>P 5.313e+01 2.908e+05 0 1
## DSSPTurn:aa_changeT>P 5.313e+01 4.112e+05 0 1
## DSSPα-helix:aa_changeT>R NA NA NA NA
## DSSPBend:aa_changeT>R NA NA NA NA
## DSSPβ-bridge:aa_changeT>R NA NA NA NA
## DSSP310 Helix:aa_changeT>R NA NA NA NA
## DSSPÏ€-helix:aa_changeT>R NA NA NA NA
## DSSPβ-strand:aa_changeT>R NA NA NA NA
## DSSPTurn:aa_changeT>R NA NA NA NA
## DSSPα-helix:aa_changeT>S 5.313e+01 5.036e+05 0 1
## DSSPBend:aa_changeT>S NA NA NA NA
## DSSPβ-bridge:aa_changeT>S NA NA NA NA
## DSSP310 Helix:aa_changeT>S NA NA NA NA
## DSSPπ-helix:aa_changeT>S NA NA NA NA
## DSSPβ-strand:aa_changeT>S 5.313e+01 4.362e+05 0 1
## DSSPTurn:aa_changeT>S NA NA NA NA
## DSSPα-helix:aa_changeV>A NA NA NA NA
## DSSPBend:aa_changeV>A NA NA NA NA
## DSSPβ-bridge:aa_changeV>A NA NA NA NA
## DSSP310 Helix:aa_changeV>A NA NA NA NA
## DSSPπ-helix:aa_changeV>A NA NA NA NA
## DSSPβ-strand:aa_changeV>A NA NA NA NA
## DSSPTurn:aa_changeV>A NA NA NA NA
## DSSPα-helix:aa_changeV>D NA NA NA NA
## DSSPBend:aa_changeV>D NA NA NA NA
## DSSPβ-bridge:aa_changeV>D NA NA NA NA
## DSSP310 Helix:aa_changeV>D NA NA NA NA
## DSSPπ-helix:aa_changeV>D NA NA NA NA
## DSSPβ-strand:aa_changeV>D NA NA NA NA
## DSSPTurn:aa_changeV>D NA NA NA NA
## DSSPα-helix:aa_changeV>E NA NA NA NA
## DSSPBend:aa_changeV>E NA NA NA NA
## DSSPβ-bridge:aa_changeV>E NA NA NA NA
## DSSP310 Helix:aa_changeV>E NA NA NA NA
## DSSPπ-helix:aa_changeV>E NA NA NA NA
## DSSPβ-strand:aa_changeV>E NA NA NA NA
## DSSPTurn:aa_changeV>E NA NA NA NA
## DSSPα-helix:aa_changeV>F NA NA NA NA
## DSSPBend:aa_changeV>F -4.426e-06 3.901e+05 0 1
## DSSPβ-bridge:aa_changeV>F NA NA NA NA
## DSSP310 Helix:aa_changeV>F NA NA NA NA
## DSSPπ-helix:aa_changeV>F NA NA NA NA
## DSSPβ-strand:aa_changeV>F NA NA NA NA
## DSSPTurn:aa_changeV>F NA NA NA NA
## DSSPα-helix:aa_changeV>G NA NA NA NA
## DSSPBend:aa_changeV>G NA NA NA NA
## DSSPβ-bridge:aa_changeV>G 1.727e-18 5.036e+05 0 1
## DSSP310 Helix:aa_changeV>G NA NA NA NA
## DSSPπ-helix:aa_changeV>G NA NA NA NA
## DSSPβ-strand:aa_changeV>G NA NA NA NA
## DSSPTurn:aa_changeV>G NA NA NA NA
## DSSPα-helix:aa_changeV>I NA NA NA NA
## DSSPBend:aa_changeV>I NA NA NA NA
## DSSPβ-bridge:aa_changeV>I NA NA NA NA
## DSSP310 Helix:aa_changeV>I NA NA NA NA
## DSSPπ-helix:aa_changeV>I NA NA NA NA
## DSSPβ-strand:aa_changeV>I 5.313e+01 3.982e+05 0 1
## DSSPTurn:aa_changeV>I NA NA NA NA
## DSSPα-helix:aa_changeV>L -4.420e-06 3.901e+05 0 1
## DSSPBend:aa_changeV>L 2.120e-07 2.980e+05 0 1
## DSSPβ-bridge:aa_changeV>L NA NA NA NA
## DSSP310 Helix:aa_changeV>L NA NA NA NA
## DSSPπ-helix:aa_changeV>L NA NA NA NA
## DSSPβ-strand:aa_changeV>L NA NA NA NA
## DSSPTurn:aa_changeV>L NA NA NA NA
## DSSPα-helix:aa_changeV>M NA NA NA NA
## DSSPBend:aa_changeV>M 4.419e-06 3.561e+05 0 1
## DSSPβ-bridge:aa_changeV>M NA NA NA NA
## DSSP310 Helix:aa_changeV>M NA NA NA NA
## DSSPπ-helix:aa_changeV>M NA NA NA NA
## DSSPβ-strand:aa_changeV>M NA NA NA NA
## DSSPTurn:aa_changeV>M NA NA NA NA
## DSSPα-helix:aa_changeW>C 5.439e-09 2.908e+05 0 1
## DSSPBend:aa_changeW>C 2.001e-07 2.908e+05 0 1
## DSSPβ-bridge:aa_changeW>C -4.637e-06 4.112e+05 0 1
## DSSP310 Helix:aa_changeW>C NA NA NA NA
## DSSPπ-helix:aa_changeW>C NA NA NA NA
## DSSPβ-strand:aa_changeW>C NA NA NA NA
## DSSPTurn:aa_changeW>C NA NA NA NA
## DSSPα-helix:aa_changeW>G NA NA NA NA
## DSSPBend:aa_changeW>G 4.434e-06 4.362e+05 0 1
## DSSPβ-bridge:aa_changeW>G NA NA NA NA
## DSSP310 Helix:aa_changeW>G NA NA NA NA
## DSSPπ-helix:aa_changeW>G NA NA NA NA
## DSSPβ-strand:aa_changeW>G NA NA NA NA
## DSSPTurn:aa_changeW>G NA NA NA NA
## DSSPα-helix:aa_changeW>L 4.415e-06 4.112e+05 0 1
## DSSPBend:aa_changeW>L 4.609e-06 5.036e+05 0 1
## DSSPβ-bridge:aa_changeW>L 4.410e-06 5.036e+05 0 1
## DSSP310 Helix:aa_changeW>L -6.715e-23 5.036e+05 0 1
## DSSPπ-helix:aa_changeW>L NA NA NA NA
## DSSPβ-strand:aa_changeW>L NA NA NA NA
## DSSPTurn:aa_changeW>L NA NA NA NA
## DSSPα-helix:aa_changeW>R -7.913e-17 3.982e+05 0 1
## DSSPBend:aa_changeW>R 1.334e-08 4.362e+05 0 1
## DSSPβ-bridge:aa_changeW>R 2.027e-07 5.036e+05 0 1
## DSSP310 Helix:aa_changeW>R NA NA NA NA
## DSSPπ-helix:aa_changeW>R NA NA NA NA
## DSSPβ-strand:aa_changeW>R 1.021e-08 3.901e+05 0 1
## DSSPTurn:aa_changeW>R NA NA NA NA
## DSSPα-helix:aa_changeW>S -4.410e-06 5.036e+05 0 1
## DSSPBend:aa_changeW>S NA NA NA NA
## DSSPβ-bridge:aa_changeW>S NA NA NA NA
## DSSP310 Helix:aa_changeW>S NA NA NA NA
## DSSPπ-helix:aa_changeW>S NA NA NA NA
## DSSPβ-strand:aa_changeW>S NA NA NA NA
## DSSPTurn:aa_changeW>S NA NA NA NA
## DSSPα-helix:aa_changeY>C 5.313e+01 4.362e+05 0 1
## DSSPBend:aa_changeY>C 5.313e+01 4.362e+05 0 1
## DSSPβ-bridge:aa_changeY>C NA NA NA NA
## DSSP310 Helix:aa_changeY>C NA NA NA NA
## DSSPπ-helix:aa_changeY>C 5.313e+01 4.362e+05 0 1
## DSSPβ-strand:aa_changeY>C 5.313e+01 3.561e+05 0 1
## DSSPTurn:aa_changeY>C NA NA NA NA
## DSSPα-helix:aa_changeY>D NA NA NA NA
## DSSPBend:aa_changeY>D -2.003e-07 4.362e+05 0 1
## DSSPβ-bridge:aa_changeY>D NA NA NA NA
## DSSP310 Helix:aa_changeY>D NA NA NA NA
## DSSPπ-helix:aa_changeY>D NA NA NA NA
## DSSPβ-strand:aa_changeY>D NA NA NA NA
## DSSPTurn:aa_changeY>D NA NA NA NA
## DSSPα-helix:aa_changeY>F NA NA NA NA
## DSSPBend:aa_changeY>F -2.250e-09 4.362e+05 0 1
## DSSPβ-bridge:aa_changeY>F NA NA NA NA
## DSSP310 Helix:aa_changeY>F NA NA NA NA
## DSSPπ-helix:aa_changeY>F NA NA NA NA
## DSSPβ-strand:aa_changeY>F NA NA NA NA
## DSSPTurn:aa_changeY>F NA NA NA NA
## DSSPα-helix:aa_changeY>H NA NA NA NA
## DSSPBend:aa_changeY>H 2.244e-29 5.036e+05 0 1
## DSSPβ-bridge:aa_changeY>H NA NA NA NA
## DSSP310 Helix:aa_changeY>H NA NA NA NA
## DSSPπ-helix:aa_changeY>H NA NA NA NA
## DSSPβ-strand:aa_changeY>H NA NA NA NA
## DSSPTurn:aa_changeY>H NA NA NA NA
## DSSPα-helix:aa_changeY>N 5.092e-29 4.362e+05 0 1
## DSSPBend:aa_changeY>N 5.683e-30 5.036e+05 0 1
## DSSPβ-bridge:aa_changeY>N NA NA NA NA
## DSSP310 Helix:aa_changeY>N NA NA NA NA
## DSSPπ-helix:aa_changeY>N NA NA NA NA
## DSSPβ-strand:aa_changeY>N NA NA NA NA
## DSSPTurn:aa_changeY>N NA NA NA NA
## DSSPα-helix:aa_changeY>S NA NA NA NA
## DSSPBend:aa_changeY>S 5.313e+01 5.036e+05 0 1
## DSSPβ-bridge:aa_changeY>S NA NA NA NA
## DSSP310 Helix:aa_changeY>S NA NA NA NA
## DSSPπ-helix:aa_changeY>S NA NA NA NA
## DSSPβ-strand:aa_changeY>S NA NA NA NA
## DSSPTurn:aa_changeY>S NA NA NA NA
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9.9393e+02 on 861 degrees of freedom
## Residual deviance: 5.0010e-09 on 435 degrees of freedom
## AIC: 854
##
## Number of Fisher Scoring iterations: 25
# Diagnostic plots for the model summarised above (terms of the form
# DSSP<class>:aa_change<substitution>).
plot(model)
# Collect the coefficient table and convert the log-odds estimates to odds
# ratios. The element name `coefficients` is spelled out in full: the shorter
# `$coefficient` only resolves through partial matching of list names.
results_specific <- as.data.frame(summary(model)$coefficients)
results_specific$Odds_ratio <- exp(results_specific[["Estimate"]])
# Keep terms with p < 0.05, ordered from the smallest to the largest odds ratio.
results_specific_sig <- results_specific %>%
  dplyr::filter(`Pr(>|z|)` < 0.05) %>%
  dplyr::arrange(Odds_ratio)
# No significant interactions
# NOTE(review): the summary printed above reports a residual deviance of
# ~5e-09, 25 Fisher scoring iterations and standard errors of ~1e5, so the
# lack of significant terms may reflect a (quasi-)separated fit rather than a
# genuine null result -- confirm before interpreting.
# Logistic regression of `is_ss` on the Grantham distance of the substitution.
model_is_ss <- glm(
  is_ss ~ grantham_distance,
  family = binomial,
  data = structure_analysis_specific
)
summary(model_is_ss)
##
## Call:
## glm(formula = is_ss ~ grantham_distance, family = binomial, data = structure_analysis_specific)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.710002 0.160841 4.414 1.01e-05 ***
## grantham_distance 0.003643 0.001648 2.210 0.0271 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.93 on 861 degrees of freedom
## Residual deviance: 988.94 on 860 degrees of freedom
## AIC: 992.94
##
## Number of Fisher Scoring iterations: 4
# Grantham distance for each value of `is_ss`: violin outline with the
# individual observations jittered on top (fixed seed, so reproducible).
ggplot(
  data = structure_analysis_specific,
  mapping = aes(x = is_ss, y = grantham_distance)
) +
  geom_violin(alpha = 0.5) +
  geom_jitter(
    alpha = 0.5,
    position = position_jitter(width = 0.2, seed = 1)
  ) +
  labs(x = "Secondary structure", y = "Grantham distance") +
  theme_bw()
# Fit regression model
# Is the amino acid substitution important for the type of SS?
# No-intercept logistic regression: one coefficient (log-odds) per substitution.
# NOTE(review): with a binomial family a factor response is read as "first
# level" versus "any other level", so this contrasts a single DSSP class with
# all the others rather than modelling every class -- confirm which level of
# DSSP comes first.
model_specific <- glm(
  DSSP ~ 0 + aa_change,
  family = binomial,
  data = structure_analysis_specific
)
# Get summary of the model
summary(model_specific)
##
## Call:
## glm(formula = DSSP ~ 0 + aa_change, family = binomial, data = structure_analysis_specific)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## aa_changeA>D 1.857e+01 2.663e+03 0.007 0.99444
## aa_changeA>E -1.404e-16 6.325e-01 0.000 1.00000
## aa_changeA>G 2.877e-01 7.638e-01 0.377 0.70642
## aa_changeA>P 5.596e-01 6.268e-01 0.893 0.37194
## aa_changeA>S 9.808e-01 6.770e-01 1.449 0.14740
## aa_changeA>T 9.163e-01 8.367e-01 1.095 0.27344
## aa_changeA>V 1.609e+00 1.095e+00 1.469 0.14178
## aa_changeC>F 1.857e+01 3.766e+03 0.005 0.99607
## aa_changeC>R 6.931e-01 8.660e-01 0.800 0.42349
## aa_changeC>S 1.857e+01 4.612e+03 0.004 0.99679
## aa_changeC>W 1.857e+01 3.766e+03 0.005 0.99607
## aa_changeC>Y 1.857e+01 3.766e+03 0.005 0.99607
## aa_changeD>A 1.523e-17 8.165e-01 0.000 1.00000
## aa_changeD>E 1.504e+00 7.817e-01 1.924 0.05435 .
## aa_changeD>G 1.857e+01 3.261e+03 0.006 0.99546
## aa_changeD>H 4.055e-01 9.129e-01 0.444 0.65692
## aa_changeD>N 1.030e+00 5.210e-01 1.976 0.04812 *
## aa_changeD>V 1.946e+00 1.069e+00 1.820 0.06872 .
## aa_changeD>Y 1.386e+00 6.455e-01 2.148 0.03174 *
## aa_changeE>A 6.931e-01 1.225e+00 0.566 0.57143
## aa_changeE>D 1.946e+00 1.069e+00 1.820 0.06872 .
## aa_changeE>G 1.946e+00 1.069e+00 1.820 0.06872 .
## aa_changeE>K 1.540e+00 6.362e-01 2.421 0.01547 *
## aa_changeE>Q 1.099e+00 1.155e+00 0.951 0.34139
## aa_changeE>V 1.253e+00 8.018e-01 1.562 0.11818
## aa_changeF>C -1.857e+01 6.523e+03 -0.003 0.99773
## aa_changeF>I 1.857e+01 4.612e+03 0.004 0.99679
## aa_changeF>L 0.000e+00 8.165e-01 0.000 1.00000
## aa_changeF>V -1.901e-17 1.414e+00 0.000 1.00000
## aa_changeF>Y 1.857e+01 6.523e+03 0.003 0.99773
## aa_changeG>A 1.386e+00 1.118e+00 1.240 0.21500
## aa_changeG>C 1.299e+00 6.513e-01 1.995 0.04607 *
## aa_changeG>D 6.190e-01 4.688e-01 1.320 0.18668
## aa_changeG>E 1.609e+00 1.095e+00 1.469 0.14178
## aa_changeG>R 9.445e-01 4.454e-01 2.120 0.03398 *
## aa_changeG>S 1.386e+00 5.590e-01 2.480 0.01314 *
## aa_changeG>V 1.099e+00 4.082e-01 2.691 0.00712 **
## aa_changeG>W 6.931e-01 1.225e+00 0.566 0.57143
## aa_changeH>D 6.931e-01 1.225e+00 0.566 0.57143
## aa_changeH>L 6.931e-01 8.660e-01 0.800 0.42349
## aa_changeH>N 6.931e-01 1.225e+00 0.566 0.57143
## aa_changeH>P 9.163e-01 8.367e-01 1.095 0.27344
## aa_changeH>Q 5.108e-01 7.303e-01 0.699 0.48425
## aa_changeH>R -2.877e-01 7.638e-01 -0.377 0.70642
## aa_changeH>Y -4.055e-01 9.129e-01 -0.444 0.65692
## aa_changeI>F 1.792e+00 1.080e+00 1.659 0.09715 .
## aa_changeI>M -1.857e+01 6.523e+03 -0.003 0.99773
## aa_changeI>N 1.857e+01 2.306e+03 0.008 0.99358
## aa_changeI>T 1.857e+01 6.523e+03 0.003 0.99773
## aa_changeI>V 0.000e+00 1.000e+00 0.000 1.00000
## aa_changeK>E 1.386e+00 1.118e+00 1.240 0.21500
## aa_changeK>N 1.857e+01 3.766e+03 0.005 0.99607
## aa_changeK>Q 1.857e+01 4.612e+03 0.004 0.99679
## aa_changeL>F -3.140e-16 1.414e+00 0.000 1.00000
## aa_changeL>H 1.570e-16 1.414e+00 0.000 1.00000
## aa_changeL>I 6.931e-01 1.225e+00 0.566 0.57143
## aa_changeL>M 0.000e+00 8.165e-01 0.000 1.00000
## aa_changeL>P 1.540e+00 6.362e-01 2.421 0.01547 *
## aa_changeL>Q 6.931e-01 8.660e-01 0.800 0.42349
## aa_changeL>R 1.386e+00 7.906e-01 1.754 0.07951 .
## aa_changeL>V 6.931e-01 1.225e+00 0.566 0.57143
## aa_changeM>I 4.055e-01 9.129e-01 0.444 0.65692
## aa_changeM>K 1.099e+00 1.155e+00 0.951 0.34139
## aa_changeM>L 6.931e-01 1.225e+00 0.566 0.57143
## aa_changeM>R 4.055e-01 9.129e-01 0.444 0.65692
## aa_changeM>T 1.857e+01 6.523e+03 0.003 0.99773
## aa_changeM>V 1.857e+01 4.612e+03 0.004 0.99679
## aa_changeN>D 1.857e+01 3.261e+03 0.006 0.99546
## aa_changeN>H 0.000e+00 1.414e+00 0.000 1.00000
## aa_changeN>I 8.473e-01 6.901e-01 1.228 0.21950
## aa_changeN>K 9.163e-01 5.916e-01 1.549 0.12143
## aa_changeN>S 1.857e+01 3.766e+03 0.005 0.99607
## aa_changeN>T -6.931e-01 1.225e+00 -0.566 0.57143
## aa_changeN>Y 1.857e+01 2.663e+03 0.007 0.99444
## aa_changeP>A 0.000e+00 1.000e+00 0.000 1.00000
## aa_changeP>H -6.931e-01 1.225e+00 -0.566 0.57143
## aa_changeP>L 1.542e-01 5.563e-01 0.277 0.78172
## aa_changeP>Q -1.204e+00 6.583e-01 -1.829 0.06740 .
## aa_changeP>R -9.163e-01 8.367e-01 -1.095 0.27344
## aa_changeP>S 2.877e-01 7.638e-01 0.377 0.70642
## aa_changeP>T 1.823e-01 6.055e-01 0.301 0.76334
## aa_changeQ>E 1.857e+01 3.261e+03 0.006 0.99546
## aa_changeQ>H 1.609e+00 7.746e-01 2.078 0.03773 *
## aa_changeQ>K 0.000e+00 6.325e-01 0.000 1.00000
## aa_changeQ>L -1.857e+01 3.766e+03 -0.005 0.99607
## aa_changeQ>P 1.386e+00 1.118e+00 1.240 0.21500
## aa_changeQ>R 1.386e+00 1.118e+00 1.240 0.21500
## aa_changeR>C 1.386e+00 6.455e-01 2.148 0.03174 *
## aa_changeR>G 1.099e+00 8.165e-01 1.346 0.17846
## aa_changeR>H 1.609e+00 7.746e-01 2.078 0.03773 *
## aa_changeR>L 1.179e+00 5.718e-01 2.061 0.03926 *
## aa_changeR>P 1.609e+00 6.325e-01 2.545 0.01094 *
## aa_changeR>Q 1.857e+01 4.612e+03 0.004 0.99679
## aa_changeR>S 3.365e-01 5.855e-01 0.575 0.56554
## aa_changeR>W 1.857e+01 6.523e+03 0.003 0.99773
## aa_changeS>C 4.055e-01 9.129e-01 0.444 0.65692
## aa_changeS>F 1.857e+01 6.523e+03 0.003 0.99773
## aa_changeS>G -6.931e-01 1.225e+00 -0.566 0.57143
## aa_changeS>I -6.931e-01 1.225e+00 -0.566 0.57143
## aa_changeS>L 1.609e+00 1.095e+00 1.469 0.14178
## aa_changeS>N -5.551e-16 1.000e+00 0.000 1.00000
## aa_changeS>P 7.850e-17 7.071e-01 0.000 1.00000
## aa_changeS>R -1.823e-01 6.055e-01 -0.301 0.76334
## aa_changeS>T -1.857e+01 6.523e+03 -0.003 0.99773
## aa_changeS>W 1.857e+01 3.766e+03 0.005 0.99607
## aa_changeS>Y 1.857e+01 6.523e+03 0.003 0.99773
## aa_changeT>A 1.099e+00 1.155e+00 0.951 0.34139
## aa_changeT>I 1.857e+01 4.612e+03 0.004 0.99679
## aa_changeT>K 1.857e+01 3.261e+03 0.006 0.99546
## aa_changeT>M 1.857e+01 3.766e+03 0.005 0.99607
## aa_changeT>N -1.857e+01 6.523e+03 -0.003 0.99773
## aa_changeT>P 2.877e-01 7.638e-01 0.377 0.70642
## aa_changeT>R 1.857e+01 3.766e+03 0.005 0.99607
## aa_changeT>S 1.099e+00 1.155e+00 0.951 0.34139
## aa_changeV>A 1.857e+01 2.917e+03 0.006 0.99492
## aa_changeV>D 1.857e+01 2.465e+03 0.008 0.99399
## aa_changeV>E 1.857e+01 3.766e+03 0.005 0.99607
## aa_changeV>F 1.857e+01 2.663e+03 0.007 0.99444
## aa_changeV>G 1.857e+01 4.612e+03 0.004 0.99679
## aa_changeV>I 1.386e+00 1.118e+00 1.240 0.21500
## aa_changeV>L 1.857e+01 2.306e+03 0.008 0.99358
## aa_changeV>M 1.857e+01 3.261e+03 0.006 0.99546
## aa_changeW>C 1.857e+01 2.063e+03 0.009 0.99282
## aa_changeW>G 1.857e+01 3.766e+03 0.005 0.99607
## aa_changeW>L 1.857e+01 2.465e+03 0.008 0.99399
## aa_changeW>R 1.857e+01 1.809e+03 0.010 0.99181
## aa_changeW>S 1.857e+01 4.612e+03 0.004 0.99679
## aa_changeY>C 9.163e-01 8.367e-01 1.095 0.27344
## aa_changeY>D 1.857e+01 3.766e+03 0.005 0.99607
## aa_changeY>F 1.857e+01 3.766e+03 0.005 0.99607
## aa_changeY>H 1.857e+01 4.612e+03 0.004 0.99679
## aa_changeY>N 1.857e+01 3.261e+03 0.006 0.99546
## aa_changeY>S 0.000e+00 1.414e+00 0.000 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1195.0 on 862 degrees of freedom
## Residual deviance: 798.2 on 729 degrees of freedom
## AIC: 1064.2
##
## Number of Fisher Scoring iterations: 17
# Diagnostic plots for the per-substitution model.
plot(model_specific)
# Coefficient table of the per-substitution model; this overwrites any earlier
# `results_specific`. `coefficients` is written in full because `$coefficient`
# only resolves through partial matching of list names.
results_specific <- as.data.frame(summary(model_specific)$coefficients)
# Strip the "aa_change" prefix so the row names are bare substitutions ("G>R").
rownames(results_specific) <- rownames(results_specific) %>%
  str_replace("aa_change", "")
# Odds ratio = exp(log-odds estimate).
results_specific$Odds_ratio <- exp(results_specific[["Estimate"]])
# Substitutions with p < 0.05, ordered by increasing odds ratio.
results_specific_sig <- results_specific %>%
  dplyr::filter(`Pr(>|z|)` < 0.05) %>%
  dplyr::arrange(Odds_ratio)
# Render the significant rows as a table rounded to two digits.
kableExtra::kbl(results_specific_sig, digits = 2)
|  | Estimate | Std. Error | z value | Pr(>\|z\|) | Odds_ratio |
|---|---|---|---|---|---|
| G>R | 0.94 | 0.45 | 2.12 | 0.03 | 2.57 |
| D>N | 1.03 | 0.52 | 1.98 | 0.05 | 2.80 |
| G>V | 1.10 | 0.41 | 2.69 | 0.01 | 3.00 |
| R>L | 1.18 | 0.57 | 2.06 | 0.04 | 3.25 |
| G>C | 1.30 | 0.65 | 1.99 | 0.05 | 3.67 |
| G>S | 1.39 | 0.56 | 2.48 | 0.01 | 4.00 |
| D>Y | 1.39 | 0.65 | 2.15 | 0.03 | 4.00 |
| R>C | 1.39 | 0.65 | 2.15 | 0.03 | 4.00 |
| E>K | 1.54 | 0.64 | 2.42 | 0.02 | 4.67 |
| L>P | 1.54 | 0.64 | 2.42 | 0.02 | 4.67 |
| R>H | 1.61 | 0.77 | 2.08 | 0.04 | 5.00 |
| Q>H | 1.61 | 0.77 | 2.08 | 0.04 | 5.00 |
| R>P | 1.61 | 0.63 | 2.54 | 0.01 | 5.00 |
# Print the significant substitutions at full precision (the table above is rounded).
results_specific_sig
## Estimate Std. Error z value Pr(>|z|) Odds_ratio
## G>R 0.9444616 0.4454354 2.120311 0.033979822 2.571429
## D>N 1.0296194 0.5209881 1.976282 0.048122832 2.800000
## G>V 1.0986123 0.4082483 2.691040 0.007122975 3.000000
## R>L 1.1786550 0.5717719 2.061408 0.039264152 3.250000
## G>C 1.2992830 0.6513389 1.994788 0.046066029 3.666667
## G>S 1.3862944 0.5590170 2.479879 0.013142707 4.000000
## D>Y 1.3862944 0.6454972 2.147638 0.031742525 4.000000
## R>C 1.3862944 0.6454972 2.147638 0.031742525 4.000000
## E>K 1.5404450 0.6362090 2.421288 0.015465632 4.666667
## L>P 1.5404450 0.6362090 2.421288 0.015465632 4.666667
## R>H 1.6094379 0.7745967 2.077775 0.037730050 5.000000
## Q>H 1.6094379 0.7745967 2.077775 0.037730050 5.000000
## R>P 1.6094379 0.6324555 2.544745 0.010935764 5.000000
# Want to know whether we find more x to y substitutions at given features
# Count the observations for every substitution / DSSP class combination; the
# reference and mutant residues are carried along as grouping columns.
ss_and_sub <- tally(
  group_by(structure_analysis_specific, aa_change, DSSP, `Ref A.A.`, `Alt A.A.`)
)
# What is the composition of beta gal in terms of secondary structure
# Residues without a DSSP assignment are counted as "Coil". A plain `mutate()`
# replaces the superseded `mutate_at()`.
beta_gal_comp <- ss_df %>%
  mutate(DSSP = replace_na(DSSP, "Coil")) %>%
  group_by(DSSP) %>%
  tally(name = "n_beta_gal") %>%
  mutate(pct_beta_gal = round(100 * n_beta_gal / sum(n_beta_gal), digits = 1))
# What is the proportion of each secondary structure in terms of number of mutations recovered
mut_ss_comp <- structure_analysis_specific %>%
  dplyr::group_by(DSSP) %>%
  tally(name = "n_muts") %>%
  mutate(pct_in_muts = round(100 * n_muts / sum(n_muts), digits = 1))
# One row per DSSP class: residue counts in the protein next to mutation counts.
composition <- left_join(beta_gal_comp, mut_ss_comp, by = "DSSP")
# Classes x 2 table (column 1 = residues in the protein, column 2 = mutations)
# for a chi-squared test of whether the two distributions differ.
contingency_table <- matrix(
  c(composition$n_beta_gal, composition$n_muts),
  nrow = nrow(composition)
)
result <- chisq.test(contingency_table)
print(result)
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 19.16, df = 7, p-value = 0.0077
# Create a vector of the letters
# (every residue that occurs as either the reference or the mutant amino acid)
letters_set <- unique(c(ss_and_sub$`Ref A.A.`, ss_and_sub$`Alt A.A.`))
# Generate all possible combinations
combinations <- expand.grid(`Ref A.A.` = letters_set, `Alt A.A.` = letters_set)
# Make data frame
# One zero-count row per residue pair. `check.names = FALSE` keeps the
# non-syntactic column names intact; it is spelled out because `F` is an
# ordinary variable that can be reassigned.
boxes <- data.frame(
  `Ref A.A.` = combinations[[1]],
  `Alt A.A.` = combinations[[2]],
  n = 0,
  check.names = FALSE
)
# Largest count among the substitution / DSSP combinations that were seen more
# than once and have a DSSP class.
max_n <- max(
  as.data.frame(ss_and_sub) %>%
    dplyr::filter(n > 1, !is.na(DSSP)) %>%
    pull(n)
)
# Plotting data: rows with a DSSP class, each tagged with a facet label of the
# form "<DSSP>, n=<total count in that class>". The result stays grouped by DSSP.
data_for_plot <- as.data.frame(ss_and_sub) %>%
  dplyr::filter(!is.na(DSSP)) %>%
  group_by(DSSP) %>%
  mutate(label = paste0(DSSP, ", n=", sum(n)))
# Reorder the labels by position within their sorted order.
# NOTE(review): the permutation assumes exactly eight labels in a particular
# sorted order -- confirm if the DSSP classes or the locale change.
data_for_plot$label <- factor(
  data_for_plot$label,
  levels = sort(unique(data_for_plot$label))[c(3, 5, 1, 8, 7, 6, 4, 2)]
)
# Names of the significant substitutions with an odds ratio below 1 ...
coil_specific <- rownames(results_specific_sig)[
  which(results_specific_sig$Odds_ratio < 1)
]
# ... and of those with an odds ratio above 1.
ss_specific <- rownames(results_specific_sig)[
  which(results_specific_sig$Odds_ratio > 1)
]
# Heat map of substitution counts (WT residue on x, mutant residue on y), drawn
# as one panel per `label` value of `data_for_plot`.
ggplot(
  data = data_for_plot,
  mapping = aes(x = `Ref A.A.`, y = `Alt A.A.`, fill = n)
) +
  # White background tile for every residue pair listed in `boxes`.
  geom_tile(data = boxes, fill = "white", colour = "lightgrey", size = 0.05) +
  # Observed counts, drawn over the background grid.
  geom_tile(colour = "grey") +
  # geom_tile(data = data_for_plot %>% dplyr::filter(aa_change %in% coil_specific), fill = NA, color = "red", size = 0.5) +
  # geom_tile(data = data_for_plot %>% dplyr::filter(aa_change %in% ss_specific), fill = NA, color = "green", size = 0.5) +
  facet_wrap2(~label, axes = "all", ncol = 4) +
  coord_fixed() +
  # Log-transformed fill scale; legend ticks at fixed counts plus `max_n`.
  scale_fill_viridis_c(
    name = "# Observations",
    option = "F",
    direction = -1,
    trans = "log",
    breaks = c(1, 5, 20, 50, max_n),
    labels = c(1, 5, 20, 50, max_n)
  ) +
  labs(x = "WT Residue", y = "Mutant Residue") +
  theme_bw() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_blank(),
    strip.text.x = element_text(size = rel(1.3)),
    panel.border = element_rect(colour = "black", fill = NA)
  )
# Same heat map without facets; substitutions listed in `coil_specific` are
# outlined in red and those in `ss_specific` in green.
ggplot(
  data = data_for_plot,
  mapping = aes(x = `Ref A.A.`, y = `Alt A.A.`, fill = n)
) +
  # White background tile for every residue pair listed in `boxes`.
  geom_tile(data = boxes, fill = "white", colour = "lightgrey", size = 0.05) +
  geom_tile(colour = "grey") +
  geom_tile(
    data = data_for_plot %>% dplyr::filter(aa_change %in% coil_specific),
    fill = NA, colour = "red", size = 0.5
  ) +
  geom_tile(
    data = data_for_plot %>% dplyr::filter(aa_change %in% ss_specific),
    fill = NA, colour = "green", size = 0.5
  ) +
  coord_fixed() +
  # Log-transformed fill scale; legend ticks at fixed counts plus `max_n`.
  scale_fill_viridis_c(
    name = "# Observations",
    option = "F",
    direction = -1,
    trans = "log",
    breaks = c(1, 5, 20, 50, max_n),
    labels = c(1, 5, 20, 50, max_n)
  ) +
  labs(x = "WT Residue", y = "Mutant Residue") +
  theme_bw() +
  theme(panel.grid.major = element_blank())
# Counts per DSSP class for each significant substitution, one panel per
# substitution, with panels in the row order of `results_specific_sig`.
ggplot(
  data = data_for_plot %>%
    dplyr::filter(aa_change %in% rownames(results_specific_sig)) %>%
    dplyr::mutate(
      aa_change = factor(aa_change, levels = rownames(results_specific_sig))
    ),
  mapping = aes(x = DSSP, y = n)
) +
  geom_col() +
  facet_wrap(~aa_change, scales = "free_y") +
  labs(x = "Secondary structure class", y = "Observed substitutions") +
  # Vertical x tick labels so the class names do not overlap.
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
# Number of rows of `structure_analysis` in each DSSP class.
structure_analysis %>%
  dplyr::group_by(DSSP) %>%
  tally()
## # A tibble: 8 × 2
## DSSP n
## <fct> <int>
## 1 Coil 227
## 2 α-helix 79
## 3 Bend 111
## 4 β-bridge 28
## 5 310 Helix 20
## 6 π-helix 15
## 7 β-strand 283
## 8 Turn 99
# Print the per-class counts and percentages for the protein and for the mutation data.
composition
## # A tibble: 8 × 5
## DSSP n_beta_gal pct_beta_gal n_muts pct_in_muts
## <chr> <int> <dbl> <int> <dbl>
## 1 310 Helix 32 3.2 20 2.3
## 2 Bend 106 10.5 111 12.9
## 3 Coil 230 22.7 227 26.3
## 4 Turn 118 11.7 99 11.5
## 5 α-helix 104 10.3 79 9.2
## 6 β-bridge 24 2.4 28 3.2
## 7 β-strand 392 38.8 283 32.8
## 8 π-helix 5 0.5 15 1.7
# Reorder the DSSP classes by position within their sorted order.
# NOTE(review): the permutation assumes exactly eight classes in a particular
# sorted order -- confirm if the classes or the locale change.
composition$DSSP <- factor(
  composition$DSSP,
  levels(factor(composition$DSSP))[c(3, 5, 1, 8, 7, 6, 4, 2)]
)
# Side-by-side bars comparing the share of each class in the protein with its
# share in the mutation data. The two percentage columns are named explicitly
# so the legend labels below cannot drift out of step with them.
ggplot(
  composition %>% pivot_longer(c(pct_beta_gal, pct_in_muts)),
  aes(x = DSSP, y = value, fill = name)
) +
  geom_col(position = "dodge") +
  theme_bw() +
  xlab("Secondary structure class") +
  # The plotted values are percentages (100 * n / sum(n)), not proportions.
  ylab("Percentage of residues or substitutions") +
  # `name = NULL` is the documented way to drop the legend title.
  scale_fill_discrete(name = NULL, labels = c("β-Gal Protein", "Mutation data")) +
  theme(legend.position = "bottom")
# How many changes to proline?
# Lists the rows whose mutant residue is proline; the tibble header gives the count.
dplyr::filter(structure_analysis, `Alt A.A.` == "P")
## # A tibble: 73 × 27
## # Rowwise:
## Position `Ref A.A.` `Alt A.A.` Consequence Type Codon aa_change PositionRef
## <dbl> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl>
## 1 19 S P missense SNV 7 S>P 16
## 2 463 A P missense SNV 149 A>P 445
## 3 629 R P missense SNV 204 R>P 611
## 4 647 R P missense SNV 210 R>P 629
## 5 1016 R P missense SNV 333 R>P 998
## 6 1085 R P missense SNV 356 R>P 1067
## 7 1174 A P missense SNV 386 A>P 1156
## 8 1181 R P missense SNV 388 R>P 1163
## 9 1229 R P missense SNV 404 R>P 1211
## 10 1271 H P missense SNV 418 H>P 1253
## # ℹ 63 more rows
## # ℹ 19 more variables: residue_code <chr>, alt_code <chr>, CodonRef <dbl>,
## # residue_name <chr>, Domain <fct>, Buried.or.Exposed <chr>,
## # NetSurf.Amino.Acid <chr>, Probability.for.Alpha.Helix <dbl>,
## # Probability.for.Beta.strand <dbl>, Probability.for.Coil <dbl>,
## # secondary_structure <chr>, grantham_distance <dbl>, conservative <lgl>,
## # pdb_ss <fct>, Chain <chr>, ss_code <chr>, solvent_accessibility <dbl>, …
# Complement of the proline query: rows whose mutant residue is not proline.
dplyr::filter(structure_analysis, `Alt A.A.` != "P")
## # A tibble: 789 × 27
## # Rowwise:
## Position `Ref A.A.` `Alt A.A.` Consequence Type Codon aa_change PositionRef
## <dbl> <chr> <chr> <chr> <chr> <dbl> <chr> <dbl>
## 1 41 P H missense SNV 8 P>H 23
## 2 51 L F missense SNV 11 L>F 33
## 3 109 P T missense SNV 31 P>T 91
## 4 110 P L missense SNV 31 P>L 92
## 5 112 P T missense SNV 32 P>T 94
## 6 122 S I missense SNV 35 S>I 104
## 7 130 N Y missense SNV 38 N>Y 112
## 8 132 N K missense SNV 38 N>K 114
## 9 138 E D missense SNV 40 E>D 120
## 10 149 T S missense SNV 44 T>S 131
## # ℹ 779 more rows
## # ℹ 19 more variables: residue_code <chr>, alt_code <chr>, CodonRef <dbl>,
## # residue_name <chr>, Domain <fct>, Buried.or.Exposed <chr>,
## # NetSurf.Amino.Acid <chr>, Probability.for.Alpha.Helix <dbl>,
## # Probability.for.Beta.strand <dbl>, Probability.for.Coil <dbl>,
## # secondary_structure <chr>, grantham_distance <dbl>, conservative <lgl>,
## # pdb_ss <fct>, Chain <chr>, ss_code <chr>, solvent_accessibility <dbl>, …
# What are the amino-acid counts in the β-gal reference sequence, and how often
# does each amino acid appear as the wild-type or the mutant residue?
# Letter frequencies of `laczref_aa`, one row per letter.
betagal_proportions <- as.data.frame(t(alphabetFrequency(laczref_aa))) %>%
  tibble::rownames_to_column(var = "aa") %>%
  dplyr::rename(n_betagal = V1)
# Times each amino acid is the wild-type (reference) residue.
wts <- structure_analysis %>%
  group_by(`Ref A.A.`) %>%
  tally(name = "n_wt") %>%
  dplyr::rename(aa = `Ref A.A.`)
# Times each amino acid is the mutant (alternate) residue.
subs <- structure_analysis %>%
  group_by(`Alt A.A.`) %>%
  tally(name = "n_mut") %>%
  dplyr::rename(aa = `Alt A.A.`)
# Combine the three per-amino-acid counts and express each as a percentage.
# Join keys and pivoted columns are named explicitly instead of relying on the
# implicit natural join and on column positions.
# Letters missing from either the wild-type or the mutant counts are dropped by
# `na.omit()` before the percentages are computed.
aa_proportions_wt_and_muts <- betagal_proportions %>%
  left_join(wts, by = "aa") %>%
  left_join(subs, by = "aa") %>%
  na.omit() %>%
  mutate(
    pct_betagal = round(100 * n_betagal / sum(n_betagal), digits = 1),
    pct_wt = round(100 * n_wt / sum(n_wt), digits = 1),
    pct_mut = round(100 * n_mut / sum(n_mut), digits = 1)
  ) %>%
  pivot_longer(
    cols = c(pct_betagal, pct_wt, pct_mut),
    names_to = "source",
    values_to = "Percent of residues"
  ) %>%
  # NOTE(review): pivoting the counts as well yields every source x
  # source_count pairing, i.e. each percentage row appears three times --
  # confirm this is intended.
  pivot_longer(
    cols = c(n_betagal, n_wt, n_mut),
    names_to = "source_count",
    values_to = "n"
  )
# Fix the display order of the three percentage series.
aa_proportions_wt_and_muts$source <- factor(
  aa_proportions_wt_and_muts$source,
  levels = c("pct_betagal", "pct_wt", "pct_mut")
)
# Percentage of each amino acid in the protein, among wild-type residues and
# among mutant residues; one facet per amino acid, so the x axis text is hidden.
ggplot(aa_proportions_wt_and_muts, aes(x = aa, y = `Percent of residues`, fill = source)) +
  geom_col(position = "dodge") +
  # `name = NULL` is the documented way to drop the legend title.
  scale_fill_discrete(
    name = NULL,
    labels = c("β-Gal Protein", "Wild type residues", "Mutated residues")
  ) +
  theme_bw() +
  facet_grid(~aa, scales = "free_x") +
  theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) +
  xlab("Amino acid")
# Most frequently observed substitutions, most common first.
structure_analysis_specific %>%
  group_by(aa_change) %>%
  tally() %>%
  arrange(desc(n))
## # A tibble: 133 × 2
## aa_change n
## <fct> <int>
## 1 G>V 32
## 2 G>R 25
## 3 G>D 20
## 4 G>S 20
## 5 D>N 19
## 6 R>P 18
## 7 E>K 17
## 8 L>P 17
## 9 R>L 17
## 10 D>Y 15
## # ℹ 123 more rows
# Per-residue hydropathy table shipped with the idpr package; its value column
# `V2` is renamed to `hydropathy`.
# NOTE(review): presumably a normalised Kyte-Doolittle scale -- confirm in the
# idpr documentation.
hydropathy <- idpr::KDNorm %>% dplyr::rename("hydropathy" = "V2")
# Tab-separated table of physicochemical classes with a header row.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
aa_classes <- read.table(
  "data/raw/physicochemical_classes.txt",
  sep = "\t",
  header = TRUE
)
# Reshape the physicochemical class table into one row per amino acid with a
# pair of columns per category.
# NOTE(review): the split on "(" suggests each raw cell reads "<label> (<score>)" -- confirm against the raw file.
aa_classes_cleaned <- aa_classes %>%
# Long format: one row per amino acid and category.
pivot_longer(-Amino.acid, names_to = "category") %>%
# Split every cell at "(" into separate rows ...
separate_rows(value, sep = "\\(") %>%
# ... then drop the first remaining parenthesis and surrounding whitespace.
mutate(value = str_remove(value, "[()]")) %>%
mutate(value = trimws(value)) %>%
# Number the pieces within each amino acid / category pair (1, 2, ...).
group_by(Amino.acid, category) %>%
mutate(row_num = row_number()) %>%
# Back to wide: first one column per category, then one column per category
# and piece number. Columns 3 to 9 are selected by position.
pivot_wider(names_from = category, values_from = value) %>%
pivot_wider(names_from = row_num, values_from = c(3:9))
# Piece 1 keeps the bare category name; piece 2 becomes "<category> score".
colnames(aa_classes_cleaned) <- colnames(aa_classes_cleaned) %>% str_remove("_1")
colnames(aa_classes_cleaned) <- colnames(aa_classes_cleaned) %>% str_replace_all("_2", " score")
# Attach hydropathy and physicochemical class data for both the wild-type and
# the mutant residue, then compute mutant-minus-wild-type differences.
# Columns joined for the wild type keep their plain names; the same columns
# joined again for the mutant residue receive the ".MUT" suffix.
structure_analysis_physicochemical <- structure_analysis_specific %>%
  left_join(hydropathy, by = c("Ref A.A." = "V1"), suffix = c("", ".WT")) %>%
  left_join(hydropathy, by = c("Alt A.A." = "V1"), suffix = c("", ".MUT")) %>%
  left_join(
    aa_classes_cleaned,
    by = c("Ref A.A." = "Amino.acid"),
    suffix = c("", ".WT")
  ) %>%
  left_join(
    aa_classes_cleaned,
    by = c("Alt A.A." = "Amino.acid"),
    suffix = c("", ".MUT")
  ) %>%
  # Convert every column whose name contains "score" to numeric.
  mutate(across(contains("score"), as.numeric)) %>%
  mutate(
    Hydropathy_diff = hydropathy.MUT - hydropathy,
    Volume_diff = `Volume score.MUT` - `Volume score`,
    Chemical_diff = `Chemical score.MUT` - `Chemical score`,
    Physicochemical_diff = `Physicochemical score.MUT` - `Physicochemical score`,
    Charge_diff = `Charge score.MUT` - `Charge score`,
    Polarity_diff = `Polarity score.MUT` - `Polarity score`
  )
# Model the Volume_diff
# Quasi-Poisson regression of Grantham distance on the volume score difference.
model_vol <- glm(
  grantham_distance ~ Volume_diff,
  family = quasipoisson,
  data = structure_analysis_physicochemical
)
summary(model_vol)
##
## Call:
## glm(formula = grantham_distance ~ Volume_diff, family = quasipoisson,
## data = structure_analysis_physicochemical)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.49169 0.01865 240.870 <2e-16 ***
## Volume_diff 0.00826 0.01095 0.754 0.451
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 25.9943)
##
## Null deviance: 23402 on 861 degrees of freedom
## Residual deviance: 23387 on 860 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
# Jittered scatter of Grantham distance against the volume score difference.
ggplot(
  data = structure_analysis_physicochemical,
  mapping = aes(x = Volume_diff, y = grantham_distance)
) +
  geom_point(position = "jitter")
# this result is not interesting or surprising because the Grantham distance takes into account the physicochemical properties of the residue change.
# Model the Hydropathy_diff
# Quasi-Poisson regression of Grantham distance on the hydropathy difference.
model_hydro <- glm(
  grantham_distance ~ Hydropathy_diff,
  family = quasipoisson,
  data = structure_analysis_physicochemical
)
summary(model_hydro)
##
## Call:
## glm(formula = grantham_distance ~ Hydropathy_diff, family = quasipoisson,
## data = structure_analysis_physicochemical)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.48696 0.01834 244.612 < 2e-16 ***
## Hydropathy_diff 0.16635 0.04232 3.931 9.13e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 25.48792)
##
## Null deviance: 23402 on 861 degrees of freedom
## Residual deviance: 23008 on 860 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
# Jittered scatter of Grantham distance against the hydropathy difference.
ggplot(
  data = structure_analysis_physicochemical,
  mapping = aes(x = Hydropathy_diff, y = grantham_distance)
) +
  geom_point(position = "jitter")
# this result is not interesting or surprising because the Grantham distance takes into account the physicochemical properties of the residue change.
# What about a relation between physiochemical properties and the ability to disrupt specific types of secondary structure?
# Hydropathy
# Logistic regression of DSSP on the hydropathy difference.
model_ss_hydro <- glm(
  DSSP ~ Hydropathy_diff,
  family = binomial,
  data = structure_analysis_physicochemical
)
summary(model_ss_hydro)
##
## Call:
## glm(formula = DSSP ~ Hydropathy_diff, family = binomial, data = structure_analysis_physicochemical)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.02986 0.07753 13.283 <2e-16 ***
## Hydropathy_diff -0.04055 0.18014 -0.225 0.822
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.93 on 861 degrees of freedom
## Residual deviance: 993.87 on 860 degrees of freedom
## AIC: 997.87
##
## Number of Fisher Scoring iterations: 4
# Hydropathy difference by DSSP class: violin outline with the individual
# observations jittered on top.
ggplot(structure_analysis_physicochemical, aes(x = DSSP, y = Hydropathy_diff)) +
  geom_violin(alpha = 0.5) +
  # `width`/`height` are written in full (the abbreviations `w`/`h` only
  # resolved through partial argument matching). The seed is fixed so the plot
  # is reproducible, as in the earlier violin plot of `is_ss`.
  geom_jitter(
    position = position_jitter(width = 0.2, height = 0.25, seed = 1),
    alpha = 0.5
  ) +
  theme_bw() +
  xlab("Secondary structure class") +
  ylab("Relative hydropathy (mutation:wild type)")
# Side chain volume
# Logistic regression of DSSP on the volume score difference.
model_ss_vol <- glm(
  DSSP ~ Volume_diff,
  family = binomial,
  data = structure_analysis_physicochemical
)
summary(model_ss_vol)
##
## Call:
## glm(formula = DSSP ~ Volume_diff, family = binomial, data = structure_analysis_physicochemical)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.06826 0.08012 13.333 <2e-16 ***
## Volume_diff -0.11272 0.04690 -2.403 0.0163 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.93 on 861 degrees of freedom
## Residual deviance: 988.06 on 860 degrees of freedom
## AIC: 992.06
##
## Number of Fisher Scoring iterations: 4
# Volume score difference by DSSP class: violin outline with the individual
# observations jittered on top.
ggplot(structure_analysis_physicochemical, aes(x = DSSP, y = Volume_diff)) +
  geom_violin(alpha = 0.5) +
  # `width`/`height` are written in full (the abbreviations `w`/`h` only
  # resolved through partial argument matching). The seed is fixed so the plot
  # is reproducible, as in the earlier violin plot of `is_ss`.
  geom_jitter(
    position = position_jitter(width = 0.2, height = 0.25, seed = 1),
    alpha = 0.5
  ) +
  theme_bw() +
  xlab("Secondary structure class") +
  ylab("Relative side chain volume (mutation:wild type)")
# Print the whole alignment rather than a truncated preview.
print(alignment, show = "complete")
##
## MsaDNAMultipleAlignment with 2 rows and 3096 columns
## aln (1..54) names
## [1] ---ACCATGATTACGGATTCACTGG---------------CCGTCGTTTTACAA V00296.1 E. coli ...
## [2] ATGACCATGATTACGGATTCACTGGAATTCCCGGGGATCCCCGTCGTTTTACAA lacZ
## Con ???ACCATGATTACGGATTCACTGG???????????????CCGTCGTTTTACAA Consensus
##
## aln (55..108) names
## [1] CGTCGTGACTGGGAAAACCCTGGCGTTACCCAACTTAATCGCCTTGCAGCACAT V00296.1 E. coli ...
## [2] CGTCGTGACTGGGAAAACCCTGGCGTTACCCAACTTAATCGCCTTGCAGCACAT lacZ
## Con CGTCGTGACTGGGAAAACCCTGGCGTTACCCAACTTAATCGCCTTGCAGCACAT Consensus
##
## aln (109..162) names
## [1] CCCCCTTTCGCCAGCTGGCGTAATAGCGAAGAGGCCCGCACCGATCGCCCTTCC V00296.1 E. coli ...
## [2] CCCCCTTTCGCCAGCTGGCGTAATAGCGAAGAGGCCCGCACCGATCGCCCTTCC lacZ
## Con CCCCCTTTCGCCAGCTGGCGTAATAGCGAAGAGGCCCGCACCGATCGCCCTTCC Consensus
##
## aln (163..216) names
## [1] CAACAGTTGCGCAGCCTGAATGGCGAATGGCGCTTTGCCTGGTTTCCGGCACCA V00296.1 E. coli ...
## [2] CAACAGTTGCGCAGCCTGAATGGCGAATGGCGCTTTGCCTGGTTTCCGGCACCA lacZ
## Con CAACAGTTGCGCAGCCTGAATGGCGAATGGCGCTTTGCCTGGTTTCCGGCACCA Consensus
##
## aln (217..270) names
## [1] GAAGCGGTGCCGGAAAGCTGGCTGGAGTGCGATCTTCCTGAGGCCGATACTGTC V00296.1 E. coli ...
## [2] GAAGCGGTGCCGGAAAGCTGGCTGGAGTGCGATCTTCCTGAGGCCGATACTGTC lacZ
## Con GAAGCGGTGCCGGAAAGCTGGCTGGAGTGCGATCTTCCTGAGGCCGATACTGTC Consensus
##
## aln (271..324) names
## [1] GTCGTCCCCTCAAACTGGCAGATGCACGGTTACGATGCGCCCATCTACACCAAC V00296.1 E. coli ...
## [2] GTCGTCCCCTCAAACTGGCAGATGCACGGTTACGATGCGCCCATCTACACCAAC lacZ
## Con GTCGTCCCCTCAAACTGGCAGATGCACGGTTACGATGCGCCCATCTACACCAAC Consensus
##
## aln (325..378) names
## [1] GTAACCTATCCCATTACGGTCAATCCGCCGTTTGTTCCCACGGAGAATCCGACG V00296.1 E. coli ...
## [2] GTGACCTATCCCATTACGGTCAATCCGCCGTTTGTTCCCACGGAGAATCCGACG lacZ
## Con GT?ACCTATCCCATTACGGTCAATCCGCCGTTTGTTCCCACGGAGAATCCGACG Consensus
##
## aln (379..432) names
## [1] GGTTGTTACTCGCTCACATTTAATGTTGATGAAAGCTGGCTACAGGAAGGCCAG V00296.1 E. coli ...
## [2] GGTTGTTACTCGCTCACATTTAATGTTGATGAAAGCTGGCTACAGGAAGGCCAG lacZ
## Con GGTTGTTACTCGCTCACATTTAATGTTGATGAAAGCTGGCTACAGGAAGGCCAG Consensus
##
## aln (433..486) names
## [1] ACGCGAATTATTTTTGATGGCGTTAACTCGGCGTTTCATCTGTGGTGCAACGGG V00296.1 E. coli ...
## [2] ACGCGAATTATTTTTGATGGCGTTAACTCGGCGTTTCATCTGTGGTGCAACGGG lacZ
## Con ACGCGAATTATTTTTGATGGCGTTAACTCGGCGTTTCATCTGTGGTGCAACGGG Consensus
##
## aln (487..540) names
## [1] CGCTGGGTCGGTTACGGCCAGGACAGTCGTTTGCCGTCTGAATTTGACCTGAGC V00296.1 E. coli ...
## [2] CGCTGGGTCGGTTACGGCCAGGACAGTCGTTTGCCGTCTGAATTTGACCTGAGC lacZ
## Con CGCTGGGTCGGTTACGGCCAGGACAGTCGTTTGCCGTCTGAATTTGACCTGAGC Consensus
##
## aln (541..594) names
## [1] GCATTTTTACGCGCCGGAGAAAACCGCCTCGCGGTGATGGTGCTGCGTTGGAGT V00296.1 E. coli ...
## [2] GCATTTTTACGCGCCGGAGAAAACCGCCTCGCGGTGATGGTGCTGCGCTGGAGT lacZ
## Con GCATTTTTACGCGCCGGAGAAAACCGCCTCGCGGTGATGGTGCTGCG?TGGAGT Consensus
##
## aln (595..648) names
## [1] GACGGCAGTTATCTGGAAGATCAGGATATGTGGCGGATGAGCGGCATTTTCCGT V00296.1 E. coli ...
## [2] GACGGCAGTTATCTGGAAGATCAGGATATGTGGCGGATGAGCGGCATTTTCCGT lacZ
## Con GACGGCAGTTATCTGGAAGATCAGGATATGTGGCGGATGAGCGGCATTTTCCGT Consensus
##
## aln (649..702) names
## [1] GACGTCTCGTTGCTGCATAAACCGACTACACAAATCAGCGATTTCCATGTTGCC V00296.1 E. coli ...
## [2] GACGTCTCGTTGCTGCATAAACCGACTACACAAATCAGCGATTTCCATGTTGCC lacZ
## Con GACGTCTCGTTGCTGCATAAACCGACTACACAAATCAGCGATTTCCATGTTGCC Consensus
##
## aln (703..756) names
## [1] ACTCGCTTTAATGATGATTTCAGCCGCGCTGTACTGGAGGCTGAAGTTCAGATG V00296.1 E. coli ...
## [2] ACTCGCTTTAATGATGATTTCAGCCGCGCTGTACTGGAGGCTGAAGTTCAGATG lacZ
## Con ACTCGCTTTAATGATGATTTCAGCCGCGCTGTACTGGAGGCTGAAGTTCAGATG Consensus
##
## aln (757..810) names
## [1] TGCGGCGAGTTGCGTGACTACCTACGGGTAACAGTTTCTTTATGGCAGGGTGAA V00296.1 E. coli ...
## [2] TGCGGCGAGTTGCGTGACTACCTACGGGTAACAGTTTCTTTATGGCAGGGTGAA lacZ
## Con TGCGGCGAGTTGCGTGACTACCTACGGGTAACAGTTTCTTTATGGCAGGGTGAA Consensus
##
## aln (811..864) names
## [1] ACGCAGGTCGCCAGCGGCACCGCGCCTTTCGGCGGTGAAATTATCGATGAGCGT V00296.1 E. coli ...
## [2] ACGCAGGTCGCCAGCGGCACCGCGCCTTTCGGCGGTGAAATTATCGATGAGCGT lacZ
## Con ACGCAGGTCGCCAGCGGCACCGCGCCTTTCGGCGGTGAAATTATCGATGAGCGT Consensus
##
## aln (865..918) names
## [1] GGTGGTTATGCCGATCGCGTCACACTACGTCTGAACGTCGAAAACCCGAAACTG V00296.1 E. coli ...
## [2] GGTGGTTATGCCGATCGCGTCACACTACGTCTGAACGTCGAAAACCCGAAACTG lacZ
## Con GGTGGTTATGCCGATCGCGTCACACTACGTCTGAACGTCGAAAACCCGAAACTG Consensus
##
## aln (919..972) names
## [1] TGGAGCGCCGAAATCCCGAATCTCTATCGTGCGGTGGTTGAACTGCACACCGCC V00296.1 E. coli ...
## [2] TGGAGCGCCGAAATCCCGAATCTCTATCGTGCGGTGGTTGAACTGCACACCGCC lacZ
## Con TGGAGCGCCGAAATCCCGAATCTCTATCGTGCGGTGGTTGAACTGCACACCGCC Consensus
##
## aln (973..1026) names
## [1] GACGGCACGCTGATTGAAGCAGAAGCCTGCGATGTCGGTTTCCGCGAGGTGCGG V00296.1 E. coli ...
## [2] GACGGCACGCTGATTGAAGCAGAAGCCTGCGATGTCGGTTTCCGCGAGGTGCGG lacZ
## Con GACGGCACGCTGATTGAAGCAGAAGCCTGCGATGTCGGTTTCCGCGAGGTGCGG Consensus
##
## aln (1027..1080) names
## [1] ATTGAAAATGGTCTGCTGCTGCTGAACGGCAAGCCGTTGCTGATTCGAGGCGTT V00296.1 E. coli ...
## [2] ATTGAAAATGGTCTGCTGCTGCTGAACGGCAAGCCGTTGCTGATTCGAGGCGTT lacZ
## Con ATTGAAAATGGTCTGCTGCTGCTGAACGGCAAGCCGTTGCTGATTCGAGGCGTT Consensus
##
## aln (1081..1134) names
## [1] AACCGTCACGAGCATCATCCTCTGCATGGTCAGGTCATGGATGAGCAGACGATG V00296.1 E. coli ...
## [2] AACCGTCACGAGCATCATCCTCTGCATGGTCAGGTCATGGATGAGCAGACGATG lacZ
## Con AACCGTCACGAGCATCATCCTCTGCATGGTCAGGTCATGGATGAGCAGACGATG Consensus
##
## aln (1135..1188) names
## [1] GTGCAGGATATCCTGCTGATGAAGCAGAACAACTTTAACGCCGTGCGCTGTTCG V00296.1 E. coli ...
## [2] GTGCAGGATATCCTGCTGATGAAGCAGAACAACTTTAACGCCGTGCGCTGTTCG lacZ
## Con GTGCAGGATATCCTGCTGATGAAGCAGAACAACTTTAACGCCGTGCGCTGTTCG Consensus
##
## aln (1189..1242) names
## [1] CATTATCCGAACCATCCGCTGTGGTACACGCTGTGCGACCGCTACGGCCTGTAT V00296.1 E. coli ...
## [2] CATTATCCGAACCATCCGCTGTGGTACACGCTGTGCGACCGCTACGGCCTGTAT lacZ
## Con CATTATCCGAACCATCCGCTGTGGTACACGCTGTGCGACCGCTACGGCCTGTAT Consensus
##
## aln (1243..1296) names
## [1] GTGGTGGATGAAGCCAATATTGAAACCCACGGCATGGTGCCAATGAATCGTCTG V00296.1 E. coli ...
## [2] GTGGTGGATGAAGCCAATATTGAAACCCACGGCATGGTGCCAATGAATCGTCTG lacZ
## Con GTGGTGGATGAAGCCAATATTGAAACCCACGGCATGGTGCCAATGAATCGTCTG Consensus
##
## aln (1297..1350) names
## [1] ACCGATGATCCGCGCTGGCTACCGGCGATGAGCGAACGCGTAACGCGAATGGTG V00296.1 E. coli ...
## [2] ACCGATGATCCGCGCTGGCTACCGGCGATGAGCGAACGCGTAACGCGAATGGTG lacZ
## Con ACCGATGATCCGCGCTGGCTACCGGCGATGAGCGAACGCGTAACGCGAATGGTG Consensus
##
## aln (1351..1404) names
## [1] CAGCGCGATCGTAATCACCCGAGTGTGATCATCTGGTCGCTGGGGAATGAATCA V00296.1 E. coli ...
## [2] CAGCGCGATCGTAATCACCCGAGTGTGATCATCTGGTCGCTGGGGAATGAATCA lacZ
## Con CAGCGCGATCGTAATCACCCGAGTGTGATCATCTGGTCGCTGGGGAATGAATCA Consensus
##
## aln (1405..1458) names
## [1] GGCCACGGCGCTAATCACGACGCGCTGTATCGCTGGATCAAATCTGTCGATCCT V00296.1 E. coli ...
## [2] GGCCACGGCGCTAATCACGACGCGCTGTATCGCTGGATCAAATCTGTCGATCCT lacZ
## Con GGCCACGGCGCTAATCACGACGCGCTGTATCGCTGGATCAAATCTGTCGATCCT Consensus
##
## aln (1459..1512) names
## [1] TCCCGCCCGGTGCAGTATGAAGGCGGCGGAGCCGACACCACGGCCACCGATATT V00296.1 E. coli ...
## [2] TCCCGCCCGGTGCAGTATGAAGGCGGCGGAGCCGACACCACGGCCACCGATATT lacZ
## Con TCCCGCCCGGTGCAGTATGAAGGCGGCGGAGCCGACACCACGGCCACCGATATT Consensus
##
## aln (1513..1566) names
## [1] ATTTGCCCGATGTACGCGCGCGTGGATGAAGACCAGCCCTTCCCGGCTGTGCCG V00296.1 E. coli ...
## [2] ATTTGCCCGATGTACGCGCGCGTGGATGAAGACCAGCCCTTCCCGGCTGTGCCG lacZ
## Con ATTTGCCCGATGTACGCGCGCGTGGATGAAGACCAGCCCTTCCCGGCTGTGCCG Consensus
##
## aln (1567..1620) names
## [1] AAATGGTCCATCAAAAAATGGCTTTCGCTACCTGGAGAGACGCGCCCGCTGATC V00296.1 E. coli ...
## [2] AAATGGTCCATCAAAAAATGGCTTTCGCTACCTGGAGAGACGCGCCCGCTGATC lacZ
## Con AAATGGTCCATCAAAAAATGGCTTTCGCTACCTGGAGAGACGCGCCCGCTGATC Consensus
##
## aln (1621..1674) names
## [1] CTTTGCGAATACGCCCACGCGATGGGTAACAGTCTTGGCGGTTTCGCTAAATAC V00296.1 E. coli ...
## [2] CTTTGCGAATACGCCCACGCGATGGGTAACAGTCTTGGCGGTTTCGCTAAATAC lacZ
## Con CTTTGCGAATACGCCCACGCGATGGGTAACAGTCTTGGCGGTTTCGCTAAATAC Consensus
##
## aln (1675..1728) names
## [1] TGGCAGGCGTTTCGTCAGTATCCCCGTTTACAGGGCGGCTTCGTCTGGGACTGG V00296.1 E. coli ...
## [2] TGGCAGGCGTTTCGTCAGTATCCCCGTTTACAGGGCGGCTTCGTCTGGGACTGG lacZ
## Con TGGCAGGCGTTTCGTCAGTATCCCCGTTTACAGGGCGGCTTCGTCTGGGACTGG Consensus
##
## aln (1729..1782) names
## [1] GTGGATCAGTCGCTGATTAAATATGATGAAAACGGCAACCCGTGGTCGGCTTAC V00296.1 E. coli ...
## [2] GTGGATCAGTCGCTGATTAAATATGATGAAAACGGCAACCCGTGGTCGGCTTAC lacZ
## Con GTGGATCAGTCGCTGATTAAATATGATGAAAACGGCAACCCGTGGTCGGCTTAC Consensus
##
## aln (1783..1836) names
## [1] GGCGGTGATTTTGGCGATACGCCGAACGATCGCCAGTTCTGTATGAACGGTCTG V00296.1 E. coli ...
## [2] GGCGGTGATTTTGGCGATACGCCGAACGATCGCCAGTTCTGTATGAACGGTCTG lacZ
## Con GGCGGTGATTTTGGCGATACGCCGAACGATCGCCAGTTCTGTATGAACGGTCTG Consensus
##
## aln (1837..1890) names
## [1] GTCTTTGCCGACCGCACGCCGCATCCAGCGCTGACGGAAGCAAAACACCAGCAG V00296.1 E. coli ...
## [2] GTCTTTGCCGACCGCACGCCGCATCCAGCGCTGACGGAAGCAAAACACCAGCAG lacZ
## Con GTCTTTGCCGACCGCACGCCGCATCCAGCGCTGACGGAAGCAAAACACCAGCAG Consensus
##
## aln (1891..1944) names
## [1] CAGTTTTTCCAGTTCCGTTTATCCGGGCAAACCATCGAAGTGACCAGCGAATAC V00296.1 E. coli ...
## [2] CAGTTTTTCCAGTTCCGTTTATCCGGGCAAACCATCGAAGTGACCAGCGAATAC lacZ
## Con CAGTTTTTCCAGTTCCGTTTATCCGGGCAAACCATCGAAGTGACCAGCGAATAC Consensus
##
## aln (1945..1998) names
## [1] CTGTTCCGTCATAGCGATAACGAGCTCCTGCACTGGATGGTGGCGCTGGATGGT V00296.1 E. coli ...
## [2] CTGTTCCGTCATAGCGATAACGAGCTCCTGCACTGGATGGTGGCGCTGGATGGT lacZ
## Con CTGTTCCGTCATAGCGATAACGAGCTCCTGCACTGGATGGTGGCGCTGGATGGT Consensus
##
## aln (1999..2052) names
## [1] AAGCCGCTGGCAAGCGGTGAAGTGCCTCTGGATGTCGCTCCACAAGGTAAACAG V00296.1 E. coli ...
## [2] AAGCCGCTGGCAAGCGGTGAAGTGCCTCTGGATGTCGCTCCACAAGGTAAACAG lacZ
## Con AAGCCGCTGGCAAGCGGTGAAGTGCCTCTGGATGTCGCTCCACAAGGTAAACAG Consensus
##
## aln (2053..2106) names
## [1] TTGATTGAACTGCCTGAACTACCGCAGCCGGAGAGCGCCGGGCAACTCTGGCTC V00296.1 E. coli ...
## [2] TTGATTGAACTGCCTGAACTACCGCAGCCGGAGAGCGCCGGGCAACTCTGGCTC lacZ
## Con TTGATTGAACTGCCTGAACTACCGCAGCCGGAGAGCGCCGGGCAACTCTGGCTC Consensus
##
## aln (2107..2160) names
## [1] ACAGTACGCGTAGTGCAACCGAACGCGACCGCATGGTCAGAAGCCGGGCACATC V00296.1 E. coli ...
## [2] ACAGTACGCGTAGTGCAACCGAACGCGACCGCATGGTCAGAAGCCGGGCACATC lacZ
## Con ACAGTACGCGTAGTGCAACCGAACGCGACCGCATGGTCAGAAGCCGGGCACATC Consensus
##
## aln (2161..2214) names
## [1] AGCGCCTGGCAGCAGTGGCGTCTGGCGGAAAACCTCAGTGTGACGCTCCCCGCC V00296.1 E. coli ...
## [2] AGCGCCTGGCAGCAGTGGCGTCTGGCGGAAAACCTCAGTGTGACGCTCCCCGCC lacZ
## Con AGCGCCTGGCAGCAGTGGCGTCTGGCGGAAAACCTCAGTGTGACGCTCCCCGCC Consensus
##
## aln (2215..2268) names
## [1] GCGTCCCACGCCATCCCGCATCTGACCACCAGCGAAATGGATTTTTGCATCGAG V00296.1 E. coli ...
## [2] GCGTCCCACGCCATCCCGCATCTGACCACCAGCGAAATGGATTTTTGCATCGAG lacZ
## Con GCGTCCCACGCCATCCCGCATCTGACCACCAGCGAAATGGATTTTTGCATCGAG Consensus
##
## aln (2269..2322) names
## [1] CTGGGTAATAAGCGTTGGCAATTTAACCGCCAGTCAGGCTTTCTTTCACAGATG V00296.1 E. coli ...
## [2] CTGGGTAATAAGCGTTGGCAATTTAACCGCCAGTCAGGCTTTCTTTCACAGATG lacZ
## Con CTGGGTAATAAGCGTTGGCAATTTAACCGCCAGTCAGGCTTTCTTTCACAGATG Consensus
##
## aln (2323..2376) names
## [1] TGGATTGGCGATAAAAAACAACTGCTGACGCCGCTGCGCGATCAGTTCACCCGT V00296.1 E. coli ...
## [2] TGGATTGGCGATAAAAAACAACTGCTGACGCCGCTGCGCGATCAGTTCACCCGT lacZ
## Con TGGATTGGCGATAAAAAACAACTGCTGACGCCGCTGCGCGATCAGTTCACCCGT Consensus
##
## aln (2377..2430) names
## [1] GCACCGCTGGATAACGACATTGGCGTAAGTGAAGCGACCCGCATTGACCCTAAC V00296.1 E. coli ...
## [2] GCACCGCTGGATAACGACATTGGCGTAAGTGAAGCGACCCGCATTGACCCTAAC lacZ
## Con GCACCGCTGGATAACGACATTGGCGTAAGTGAAGCGACCCGCATTGACCCTAAC Consensus
##
## aln (2431..2484) names
## [1] GCCTGGGTCGAACGCTGGAAGGCGGCGGGCCATTACCAGGCCGAAGCAGCGTTG V00296.1 E. coli ...
## [2] GCCTGGGTCGAACGCTGGAAGGCGGCGGGCCATTACCAGGCCGAAGCAGCGTTG lacZ
## Con GCCTGGGTCGAACGCTGGAAGGCGGCGGGCCATTACCAGGCCGAAGCAGCGTTG Consensus
##
## aln (2485..2538) names
## [1] TTGCAGTGCACGGCAGATACACTTGCTGATGCGGTGCTGATTACGACCGCTCAC V00296.1 E. coli ...
## [2] TTGCAGTGCACGGCAGATACACTTGCTGATGCGGTGCTGATTACGACCGCTCAC lacZ
## Con TTGCAGTGCACGGCAGATACACTTGCTGATGCGGTGCTGATTACGACCGCTCAC Consensus
##
## aln (2539..2592) names
## [1] GCGTGGCAGCATCAGGGGAAAACCTTATTTATCAGCCGGAAAACCTACCGGATT V00296.1 E. coli ...
## [2] GCGTGGCAGCATCAGGGGAAAACCTTATTTATCAGCCGGAAAACCTACCGGATT lacZ
## Con GCGTGGCAGCATCAGGGGAAAACCTTATTTATCAGCCGGAAAACCTACCGGATT Consensus
##
## aln (2593..2646) names
## [1] GATGGTAGTGGTCAAATGGCGATTACCGTTGATGTTGAAGTGGCGAGCGATACA V00296.1 E. coli ...
## [2] GATGGTAGTGGTCAAATGGCGATTACCGTTGATGTTGAAGTGGCGAGCGATACA lacZ
## Con GATGGTAGTGGTCAAATGGCGATTACCGTTGATGTTGAAGTGGCGAGCGATACA Consensus
##
## aln (2647..2700) names
## [1] CCGCATCCGGCGCGGATTGGCCTGAACTGCCAGCTGGCGCAGGTAGCAGAGCGG V00296.1 E. coli ...
## [2] CCGCATCCGGCGCGGATTGGCCTGAACTGCCAGCTGGCGCAGGTAGCAGAGCGG lacZ
## Con CCGCATCCGGCGCGGATTGGCCTGAACTGCCAGCTGGCGCAGGTAGCAGAGCGG Consensus
##
## aln (2701..2754) names
## [1] GTAAACTGGCTCGGATTAGGGCCGCAAGAAAACTATCCCGACCGCCTTACTGCC V00296.1 E. coli ...
## [2] GTAAACTGGCTCGGATTAGGGCCGCAAGAAAACTATCCCGACCGCCTTACTGCC lacZ
## Con GTAAACTGGCTCGGATTAGGGCCGCAAGAAAACTATCCCGACCGCCTTACTGCC Consensus
##
## aln (2755..2808) names
## [1] GCCTGTTTTGACCGCTGGGATCTGCCATTGTCAGACATGTATACCCCGTACGTC V00296.1 E. coli ...
## [2] GCCTGTTTTGACCGCTGGGATCTGCCATTGTCAGACATGTATACCCCGTACGTC lacZ
## Con GCCTGTTTTGACCGCTGGGATCTGCCATTGTCAGACATGTATACCCCGTACGTC Consensus
##
## aln (2809..2862) names
## [1] TTCCCGAGCGAAAACGGTCTGCGCTGCGGGACGCGCGAATTGAATTATGGCCCA V00296.1 E. coli ...
## [2] TTCCCGAGCGAAAACGGTCTGCGCTGCGGGACGCGCGAATTGAATTATGGCCCA lacZ
## Con TTCCCGAGCGAAAACGGTCTGCGCTGCGGGACGCGCGAATTGAATTATGGCCCA Consensus
##
## aln (2863..2916) names
## [1] CACCAGTGGCGCGGCGACTTCCAGTTCAACATCAGCCGCTACAGTCAACAGCAA V00296.1 E. coli ...
## [2] CACCAGTGGCGCGGCGACTTCCAGTTCAACATCAGCCGCTACAGTCAACAGCAA lacZ
## Con CACCAGTGGCGCGGCGACTTCCAGTTCAACATCAGCCGCTACAGTCAACAGCAA Consensus
##
## aln (2917..2970) names
## [1] CTGATGGAAACCAGCCATCGCCATCTGCTGCACGCGGAAGAAGGCACATGGCTG V00296.1 E. coli ...
## [2] CTGATGGAAACCAGCCATCGCCATCTGCTGCACGCGGAAGAAGGCACATGGCTG lacZ
## Con CTGATGGAAACCAGCCATCGCCATCTGCTGCACGCGGAAGAAGGCACATGGCTG Consensus
##
## aln (2971..3024) names
## [1] AATATCGACGGTTTCCATATGGGGATTGGTGGCGACGACTCCTGGAGCCCGTCA V00296.1 E. coli ...
## [2] AATATCGACGGTTTCCATATGGGGATTGGTGGCGACGACTCCTGGAGCCCGTCA lacZ
## Con AATATCGACGGTTTCCATATGGGGATTGGTGGCGACGACTCCTGGAGCCCGTCA Consensus
##
## aln (3025..3078) names
## [1] GTATCGGCGGAATTCCAGCTGAGCGCCGGTCGCTACCATTACCAGTTGGTCTGG V00296.1 E. coli ...
## [2] GTATCGGCGGAATTACAGCTGAGCGCCGGTCGCTACCATTACCAGTTGGTCTGG lacZ
## Con GTATCGGCGGAATT?CAGCTGAGCGCCGGTCGCTACCATTACCAGTTGGTCTGG Consensus
##
## aln (3079..3096) names
## [1] TGTCAAAAATAATAATAA V00296.1 E. coli ...
## [2] TGTCAAAAATAATAATAA lacZ
## Con TGTCAAAAATAATAATAA Consensus